From 840ff9222103a2988b15d6d60415d56f7c81f863 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 17 Dec 2025 14:04:45 +0100 Subject: [PATCH 001/187] [Docs] Reflect the 0.20 changes related to `working_dir` and `repo_dir` (#3356) * [Docs] Reflect the 0.20 changes related to `working_dir` and `repo_dir` (WIP) * [Docs] Reflect the 0.20 changes related to `working_dir` and `repo_dir` --- docs/docs/concepts/dev-environments.md | 53 ++++++++----- docs/docs/concepts/services.md | 69 +++++++++++------ docs/docs/concepts/tasks.md | 103 +++++++++++++++---------- 3 files changed, 139 insertions(+), 86 deletions(-) diff --git a/docs/docs/concepts/dev-environments.md b/docs/docs/concepts/dev-environments.md index 4d46e73ac4..bda3406a61 100644 --- a/docs/docs/concepts/dev-environments.md +++ b/docs/docs/concepts/dev-environments.md @@ -301,11 +301,11 @@ If you don't assign a value to an environment variable (see `HF_TOKEN` above), ### Working directory -If `working_dir` is not specified, it defaults to `/workflow`. +If `working_dir` is not specified, it defaults to the working directory set in the Docker image. For example, the [default image](#default-image) uses `/dstack/run` as its working directory. -The `working_dir` must be an absolute path. The tilde (`~`) is supported (e.g., `~/my-working-dir`). +If the Docker image does not have a working directory set, `dstack` uses `/` as the `working_dir`. - +The `working_dir` must be an absolute path. The tilde (`~`) is supported (e.g., `~/my-working-dir`). @@ -320,7 +320,7 @@ type: dev-environment name: vscode files: - - .:examples # Maps the directory where `.dstack.yml` to `/workflow/examples` + - .:examples # Maps the directory with `.dstack.yml` to `/examples` - ~/.ssh/id_rsa:/root/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` ide: vscode @@ -329,7 +329,7 @@ ide: vscode If the local path is relative, it’s resolved relative to the configuration file. 
-If the container path is relative, it’s resolved relative to `/workflow`. +If the container path is relative, it’s resolved relative to the [working directory](#working-directory). The container path is optional. If not specified, it will be automatically calculated: @@ -340,7 +340,7 @@ type: dev-environment name: vscode files: - - ../examples # Maps `examples` (the parent directory of `.dstack.yml`) to `/workflow/examples` + - ../examples # Maps the parent directory of `.dstack.yml` to `/../examples` - ~/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` ide: vscode @@ -355,9 +355,9 @@ ide: vscode ### Repos -Sometimes, you may want to mount an entire Git repo inside the container. +Sometimes, you may want to clone an entire Git repo inside the container. -Imagine you have a cloned Git repo containing an `examples` subdirectory with a `.dstack.yml` file: +Imagine you have a Git repo (clonned locally) containing an `examples` subdirectory with a `.dstack.yml` file:
@@ -366,8 +366,7 @@ type: dev-environment name: vscode repos: - # Mounts the parent directory of `examples` (must be a Git repo) - # to `/workflow` (the default working directory) + # Clones the repo from the parent directory (`examples/..`) to `` - .. ide: vscode @@ -375,15 +374,13 @@ ide: vscode
-When you run it, `dstack` fetches the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. +When you run it, `dstack` clones the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. The local path can be either relative to the configuration file or absolute. ??? info "Repo directory" - By default, `dstack` mounts the repo to `/workflow` (the default working directory). + By default, `dstack` clones the repo to the [working directory](#working-directory). - - You can override the repo directory using either a relative or an absolute path:
@@ -393,8 +390,7 @@ The local path can be either relative to the configuration file or absolute. name: vscode repos: - # Mounts the parent directory of `examples` (must be a Git repo) - # to `/my-repo` + # Clones the repo in the parent directory (`examples/..`) to `/my-repo` - ..:/my-repo ide: vscode @@ -402,7 +398,22 @@ The local path can be either relative to the configuration file or absolute.
- If the path is relative, it is resolved against [working directory](#working-directory). + > If the repo directory is relative, it is resolved against [working directory](#working-directory). + + If the repo directory is not empty, the run will fail with a runner error. + To override this behavior, you can set `if_exists` to `skip`: + + ```yaml + type: dev-environment + name: vscode + + repos: + - local_path: .. + path: /my-repo + if_exists: skip + + ide: vscode + ``` ??? info "Repo size" @@ -411,7 +422,7 @@ The local path can be either relative to the configuration file or absolute. You can increase the 2MB limit by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. ??? info "Repo URL" - Sometimes you may want to mount a Git repo without cloning it locally. In this case, simply provide a URL in `repos`: + Sometimes you may want to clone a Git repo within the container without cloning it locally. In this case, simply provide a URL in `repos`:
@@ -420,7 +431,7 @@ The local path can be either relative to the configuration file or absolute. name: vscode repos: - # Clone the specified repo to `/workflow` (the default working directory) + # Clone the repo to `` - https://github.com/dstackai/dstack ide: vscode @@ -432,9 +443,9 @@ The local path can be either relative to the configuration file or absolute. If a Git repo is private, `dstack` will automatically try to use your default Git credentials (from `~/.ssh/config` or `~/.config/gh/hosts.yml`). - If you want to use custom credentials, you can provide them with [`dstack init`](../reference/cli/dstack/init.md). + > If you want to use custom credentials, ensure to pass them via [`dstack init`](../reference/cli/dstack/init.md) before submitting a run. -> Currently, you can configure up to one repo per run configuration. +Currently, you can configure up to one repo per run configuration. ### Retry policy diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 24a0187de8..745f78e3f0 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -597,15 +597,12 @@ resources: ### Working directory -If `working_dir` is not specified, it defaults to `/workflow`. +If `working_dir` is not specified, it defaults to the working directory set in the Docker image. For example, the [default image](#default-image) uses `/dstack/run` as its working directory. -!!! info "No commands" - If you’re using a custom `image` without `commands`, then `working_dir` is taken from `image`. +If the Docker image does not have a working directory set, `dstack` uses `/` as the `working_dir`. The `working_dir` must be an absolute path. The tilde (`~`) is supported (e.g., `~/my-working-dir`). 
- - ### Files @@ -621,7 +618,7 @@ type: service name: llama-2-7b-service files: - - .:examples # Maps the directory where `.dstack.yml` to `/workflow/examples` + - .:examples # Maps the directory with `.dstack.yml` to `/examples` - ~/.ssh/id_rsa:/root/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` python: 3.12 @@ -640,11 +637,10 @@ resources:
-Each entry maps a local directory or file to a path inside the container. Both local and container paths can be relative or absolute. - -If the local path is relative, it’s resolved relative to the configuration file. If the container path is relative, it’s resolved relative to `/workflow`. +If the local path is relative, it’s resolved relative to the configuration file. +If the container path is relative, it’s resolved relative to the [working directory](#working-directory). -The container path is optional. If not specified, it will be automatically calculated. +The container path is optional. If not specified, it will be automatically calculated: @@ -655,7 +651,7 @@ type: service name: llama-2-7b-service files: - - ../examples # Maps `examples` (the parent directory of `.dstack.yml`) to `/workflow/examples` + - ../examples # Maps the parent directory of `.dstack.yml` to `/../examples` - ~/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rsa` python: 3.12 @@ -681,9 +677,9 @@ resources: ### Repos -Sometimes, you may want to mount an entire Git repo inside the container. +Sometimes, you may want to clone an entire Git repo inside the container. -Imagine you have a cloned Git repo containing an `examples` subdirectory with a `.dstack.yml` file: +Imagine you have a Git repo (clonned locally) containing an `examples` subdirectory with a `.dstack.yml` file: @@ -694,8 +690,7 @@ type: service name: llama-2-7b-service repos: - # Mounts the parent directory of `examples` (must be a Git repo) - # to `/workflow` (the default working directory) + # Clones the repo from the parent directory (`examples/..`) to `` - .. python: 3.12 @@ -714,12 +709,12 @@ resources: -When you run it, `dstack` fetches the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. +When you run it, `dstack` clones the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. 
The local path can be either relative to the configuration file or absolute. ??? info "Repo directory" - By default, `dstack` mounts the repo to `/workflow` (the default working directory). + By default, `dstack` clones the repo to the [working directory](#working-directory). @@ -732,8 +727,7 @@ The local path can be either relative to the configuration file or absolute. name: llama-2-7b-service repos: - # Mounts the parent directory of `examples` (must be a Git repo) - # to `/my-repo` + # Clones the repo in the parent directory (`examples/..`) to `/my-repo` - ..:/my-repo python: 3.12 @@ -752,7 +746,33 @@ The local path can be either relative to the configuration file or absolute. - If the path is relative, it is resolved against `working_dir`. + > If the repo directory is relative, it is resolved against [working directory](#working-directory). + + If the repo directory is not empty, the run will fail with a runner error. + To override this behavior, you can set `if_exists` to `skip`: + + ```yaml + type: service + name: llama-2-7b-service + + repos: + - local_path: .. + path: /my-repo + if_exists: skip + + python: 3.12 + + env: + - HF_TOKEN + - MODEL=NousResearch/Llama-2-7b-chat-hf + commands: + - uv pip install vllm + - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000 + port: 8000 + + resources: + gpu: 24GB + ``` ??? info "Repo size" The repo size is not limited. However, local changes are limited to 2MB. @@ -760,8 +780,7 @@ The local path can be either relative to the configuration file or absolute. You can increase the 2MB limit by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. ??? info "Repo URL" - - Sometimes you may want to mount a Git repo without cloning it locally. In this case, simply provide a URL in `repos`: + Sometimes you may want to clone a Git repo within the container without cloning it locally. 
In this case, simply provide a URL in `repos`: @@ -772,7 +791,7 @@ The local path can be either relative to the configuration file or absolute. name: llama-2-7b-service repos: - # Clone the specified repo to `/workflow` (the default working directory) + # Clone the repo to `` - https://github.com/dstackai/dstack python: 3.12 @@ -795,9 +814,9 @@ The local path can be either relative to the configuration file or absolute. If a Git repo is private, `dstack` will automatically try to use your default Git credentials (from `~/.ssh/config` or `~/.config/gh/hosts.yml`). - If you want to use custom credentials, you can provide them with [`dstack init`](../reference/cli/dstack/init.md). + > If you want to use custom credentials, you can provide them with [`dstack init`](../reference/cli/dstack/init.md). -> Currently, you can configure up to one repo per run configuration. +Currently, you can configure up to one repo per run configuration. ### Retry policy diff --git a/docs/docs/concepts/tasks.md b/docs/docs/concepts/tasks.md index ef3d3e85b6..ac94415d4d 100644 --- a/docs/docs/concepts/tasks.md +++ b/docs/docs/concepts/tasks.md @@ -32,7 +32,7 @@ commands: - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL --dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -199,7 +199,7 @@ commands: - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL --dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -276,7 +276,7 @@ commands: - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL --dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -417,7 +417,7 @@ resources: ```yaml type: task -name: trl-sft +name: trl-sft python: 3.12 @@ -431,7 +431,7 @@ commands: - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL 
--dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -463,15 +463,12 @@ If you don't assign a value to an environment variable (see `HF_TOKEN` above), ### Working directory -If `working_dir` is not specified, it defaults to `/workflow`. +If `working_dir` is not specified, it defaults to the working directory set in the Docker image. For example, the [default image](#default-image) uses `/dstack/run` as its working directory. -!!! info "No commands" - If you’re using a custom `image` without `commands`, then `working_dir` is taken from `image`. +If the Docker image does not have a working directory set, `dstack` uses `/` as the `working_dir`. The `working_dir` must be an absolute path. The tilde (`~`) is supported (e.g., `~/my-working-dir`). - - ### Files @@ -485,7 +482,7 @@ type: task name: trl-sft files: - - .:examples # Maps the directory where `.dstack.yml` to `/workflow/examples` + - .:examples # Maps the directory with `.dstack.yml` to `/examples` - ~/.ssh/id_rsa:/root/.ssh/id_rsa # Maps `~/.ssh/id_rsa` to `/root/.ssh/id_rs python: 3.12 @@ -500,7 +497,7 @@ commands: - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL --dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -509,11 +506,10 @@ resources: -Each entry maps a local directory or file to a path inside the container. Both local and container paths can be relative or absolute. +If the local path is relative, it’s resolved relative to the configuration file. +If the container path is relative, it’s resolved relative to the [working directory](#working-directory). -If the local path is relative, it’s resolved relative to the configuration file. If the container path is relative, it’s resolved relative to `/workflow`. - -The container path is optional. If not specified, it will be automatically calculated. +The container path is optional. 
If not specified, it will be automatically calculated: @@ -521,11 +517,11 @@ The container path is optional. If not specified, it will be automatically calcu ```yaml type: task -name: trl-sft +name: trl-sft files: - - ../examples # Maps `examples` (the parent directory of `.dstack.yml`) to `/workflow/examples` - - ~/.cache/huggingface/token # Maps `~/.cache/huggingface/token` to `/root/~/.cache/huggingface/token` + - ../examples # Maps the parent directory of `.dstack.yml` to `/../examples` + - ~/.cache/huggingface/token # Maps `~/.cache/huggingface/token` to `/root/.cache/huggingface/token` python: 3.12 @@ -539,7 +535,7 @@ commands: - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL --dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -555,9 +551,9 @@ resources: ### Repos -Sometimes, you may want to mount an entire Git repo inside the container. +Sometimes, you may want to clone an entire Git repo inside the container. -Imagine you have a cloned Git repo containing an `examples` subdirectory with a `.dstack.yml` file: +Imagine you have a Git repo (clonned locally) containing an `examples` subdirectory with a `.dstack.yml` file: @@ -565,11 +561,10 @@ Imagine you have a cloned Git repo containing an `examples` subdirectory with a ```yaml type: task -name: trl-sft +name: trl-sft repos: - # Mounts the parent directory of `examples` (must be a Git repo) - # to `/workflow` (the default working directory) + # Clones the repo from the parent directory (`examples/..`) to `` - .. 
python: 3.12 @@ -584,7 +579,7 @@ commands: - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL --dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -593,26 +588,23 @@ resources: -When you run it, `dstack` fetches the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. +When you run it, `dstack` clones the repo on the instance, applies your local changes, and mounts it—so the container matches your local repo. The local path can be either relative to the configuration file or absolute. ??? info "Repo directory" - By default, `dstack` mounts the repo to `/workflow` (the default working directory). + By default, `dstack` clones the repo to the [working directory](#working-directory). - - You can override the repo directory using either a relative or an absolute path:
```yaml type: task - name: trl-sft + name: trl-sft repos: - # Mounts the parent directory of `examples` (must be a Git repo) - # to `/my-repo` + # Clones the repo in the parent directory (`examples/..`) to `/my-repo` - ..:/my-repo python: 3.12 @@ -627,7 +619,7 @@ The local path can be either relative to the configuration file or absolute. - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL --dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -636,7 +628,38 @@ The local path can be either relative to the configuration file or absolute.
- If the path is relative, it is resolved against [working directory](#working-directory). + > If the repo directory is relative, it is resolved against [working directory](#working-directory). + + If the repo directory is not empty, the run will fail with a runner error. + To override this behavior, you can set `if_exists` to `skip`: + + ```yaml + type: task + name: trl-sft + + repos: + - local_path: .. + path: /my-repo + if_exists: skip + + python: 3.12 + + env: + - HF_TOKEN + - HF_HUB_ENABLE_HF_TRANSFER=1 + - MODEL=Qwen/Qwen2.5-0.5B + - DATASET=stanfordnlp/imdb + + commands: + - uv pip install trl + - | + trl sft \ + --model_name_or_path $MODEL --dataset_name $DATASET \ + --num_processes $DSTACK_GPUS_PER_NODE + + resources: + gpu: H100:1 + ``` ??? info "Repo size" The repo size is not limited. However, local changes are limited to 2MB. @@ -644,7 +667,7 @@ The local path can be either relative to the configuration file or absolute. You can increase the 2MB limit by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable. ??? info "Repo URL" - Sometimes you may want to mount a Git repo without cloning it locally. In this case, simply provide a URL in `repos`: + Sometimes you may want to clone a Git repo within the container without cloning it locally. In this case, simply provide a URL in `repos`: @@ -655,7 +678,7 @@ The local path can be either relative to the configuration file or absolute. name: trl-sft repos: - # Clone the specified repo to `/workflow` (the default working directory) + # Clone the repo to `` - https://github.com/dstackai/dstack python: 3.12 @@ -670,7 +693,7 @@ The local path can be either relative to the configuration file or absolute. - uv pip install trl - | trl sft \ - --model_name_or_path $MODEL --dataset_name $DATASET + --model_name_or_path $MODEL --dataset_name $DATASET \ --num_processes $DSTACK_GPUS_PER_NODE resources: @@ -683,9 +706,9 @@ The local path can be either relative to the configuration file or absolute. 
If a Git repo is private, `dstack` will automatically try to use your default Git credentials (from `~/.ssh/config` or `~/.config/gh/hosts.yml`). - If you want to use custom credentials, you can provide them with [`dstack init`](../reference/cli/dstack/init.md). + > If you want to use custom credentials, you can provide them with [`dstack init`](../reference/cli/dstack/init.md). -> Currently, you can configure up to one repo per run configuration. +Currently, you can configure up to one repo per run configuration. ### Retry policy From e74332adacb4a728cacbc8e719662e16ee058215 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Wed, 17 Dec 2025 13:07:53 +0000 Subject: [PATCH 002/187] [Docs]: Fix environment variables reference layout (#3396) --- docs/docs/reference/environment-variables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/reference/environment-variables.md b/docs/docs/reference/environment-variables.md index 10bf723b3a..4575f1b8f8 100644 --- a/docs/docs/reference/environment-variables.md +++ b/docs/docs/reference/environment-variables.md @@ -131,7 +131,7 @@ For more details on the options below, refer to the [server deployment](../guide - `DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS`{ #DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS } – Maximum age of metrics samples for finished jobs. - `DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS`{ #DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS } – Maximum age of instance health checks. - `DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS`{ #DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS } – Minimum time interval between consecutive health checks of the same instance. -- `DSTACK_SERVER_EVENTS_TTL_SECONDS` { #DSTACK_SERVER_EVENTS_TTL_SECONDS } - Maximum age of event records. Set to `0` to disable event storage. Defaults to 30 days. +- `DSTACK_SERVER_EVENTS_TTL_SECONDS`{ #DSTACK_SERVER_EVENTS_TTL_SECONDS } - Maximum age of event records. 
Set to `0` to disable event storage. Defaults to 30 days. ??? info "Internal environment variables" The following environment variables are intended for development purposes: From 6f647432137415825bac939b1fe3dfea19ca8e92 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 18 Dec 2025 07:53:52 +0000 Subject: [PATCH 003/187] Add more events about users and projects (#3390) - User updated - User token refreshed - User SSH key refreshed - User deleted - Project updated - Project deleted Also refactor the implementation of the relevant operations on users to enable more detailed event messages and to avoid race conditions and longer write transactions. --- src/dstack/_internal/server/routers/runs.py | 4 +- src/dstack/_internal/server/routers/users.py | 9 +- .../_internal/server/services/projects.py | 19 +- src/dstack/_internal/server/services/users.py | 211 +++++++++++------- .../_internal/server/routers/test_projects.py | 10 + .../_internal/server/routers/test_users.py | 13 ++ 6 files changed, 177 insertions(+), 89 deletions(-) diff --git a/src/dstack/_internal/server/routers/runs.py b/src/dstack/_internal/server/routers/runs.py index 24baee9179..a4a09b3fb8 100644 --- a/src/dstack/_internal/server/routers/runs.py +++ b/src/dstack/_internal/server/routers/runs.py @@ -118,7 +118,7 @@ async def get_plan( """ user, project = user_project if not user.ssh_public_key and not body.run_spec.ssh_key_pub: - await users.refresh_ssh_key(session=session, user=user) + await users.refresh_ssh_key(session=session, actor=user) run_plan = await runs.get_plan( session=session, project=project, @@ -148,7 +148,7 @@ async def apply_plan( """ user, project = user_project if not user.ssh_public_key and not body.plan.run_spec.ssh_key_pub: - await users.refresh_ssh_key(session=session, user=user) + await users.refresh_ssh_key(session=session, actor=user) return CustomORJSONResponse( await runs.apply_plan( session=session, diff --git 
a/src/dstack/_internal/server/routers/users.py b/src/dstack/_internal/server/routers/users.py index 2568c6ac29..1feac5da36 100644 --- a/src/dstack/_internal/server/routers/users.py +++ b/src/dstack/_internal/server/routers/users.py @@ -43,7 +43,7 @@ async def get_my_user( ): if user.ssh_private_key is None or user.ssh_public_key is None: # Generate keys for pre-0.19.33 users - await users.refresh_ssh_key(session=session, user=user) + await users.refresh_ssh_key(session=session, actor=user) return CustomORJSONResponse(users.user_model_to_user_with_creds(user)) @@ -86,6 +86,7 @@ async def update_user( ): res = await users.update_user( session=session, + actor=user, username=body.username, global_role=body.global_role, email=body.email, @@ -102,7 +103,7 @@ async def refresh_ssh_key( session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), ): - res = await users.refresh_ssh_key(session=session, user=user, username=body.username) + res = await users.refresh_ssh_key(session=session, actor=user, username=body.username) if res is None: raise ResourceNotExistsError() return CustomORJSONResponse(users.user_model_to_user_with_creds(res)) @@ -114,7 +115,7 @@ async def refresh_token( session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), ): - res = await users.refresh_user_token(session=session, user=user, username=body.username) + res = await users.refresh_user_token(session=session, actor=user, username=body.username) if res is None: raise ResourceNotExistsError() return CustomORJSONResponse(users.user_model_to_user_with_creds(res)) @@ -128,6 +129,6 @@ async def delete_users( ): await users.delete_users( session=session, - user=user, + actor=user, usernames=body.users, ) diff --git a/src/dstack/_internal/server/services/projects.py b/src/dstack/_internal/server/services/projects.py index 2004b5cccd..5e4842df56 100644 --- a/src/dstack/_internal/server/services/projects.py +++ 
b/src/dstack/_internal/server/services/projects.py @@ -169,8 +169,16 @@ async def update_project( project: ProjectModel, is_public: bool, ): - """Update project visibility (public/private).""" - project.is_public = is_public + updated_fields = [] + if is_public != project.is_public: + project.is_public = is_public + updated_fields.append(f"is_public={is_public}") + events.emit( + session, + f"Project updated. Updated fields: {', '.join(updated_fields) or ''}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project)], + ) await session.commit() @@ -222,9 +230,14 @@ async def delete_projects( "deleted": True, } ) + events.emit( + session, + "Project deleted", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(p)], + ) await session.execute(update(ProjectModel), updates) await session.commit() - logger.info("Deleted projects %s by user %s", projects_names, user.name) async def set_project_members( diff --git a/src/dstack/_internal/server/services/users.py b/src/dstack/_internal/server/services/users.py index 62fcc848ea..e8fbcde782 100644 --- a/src/dstack/_internal/server/services/users.py +++ b/src/dstack/_internal/server/services/users.py @@ -3,14 +3,19 @@ import re import secrets import uuid +from collections.abc import AsyncGenerator +from contextlib import asynccontextmanager from typing import Awaitable, Callable, List, Optional, Tuple -from sqlalchemy import delete, select, update +from sqlalchemy import delete, select from sqlalchemy import func as safunc from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import load_only -from dstack._internal.core.errors import ResourceExistsError, ServerClientError +from dstack._internal.core.errors import ( + ResourceExistsError, + ServerClientError, +) from dstack._internal.core.models.users import ( GlobalRole, User, @@ -19,8 +24,10 @@ UserTokenCreds, UserWithCreds, ) +from dstack._internal.server.db import get_db from 
dstack._internal.server.models import DecryptedString, MemberModel, UserModel from dstack._internal.server.services import events +from dstack._internal.server.services.locking import get_locker from dstack._internal.server.services.permissions import get_default_permissions from dstack._internal.server.utils.routers import error_forbidden from dstack._internal.utils import crypto @@ -123,114 +130,128 @@ async def create_user( async def update_user( session: AsyncSession, + actor: UserModel, username: str, global_role: GlobalRole, email: Optional[str] = None, active: bool = True, -) -> UserModel: - await session.execute( - update(UserModel) - .where( - UserModel.name == username, - UserModel.deleted == False, - ) - .values( - global_role=global_role, - email=email, - active=active, +) -> Optional[UserModel]: + async with get_user_model_by_name_for_update(session, username) as user: + if user is None: + return None + updated_fields = [] + if global_role != user.global_role: + user.global_role = global_role + updated_fields.append(f"global_role={global_role}") + if email != user.email: + user.email = email + updated_fields.append("email") # do not include potentially sensitive new value + if active != user.active: + user.active = active + updated_fields.append(f"active={active}") + events.emit( + session, + f"User updated. 
Updated fields: {', '.join(updated_fields) or ''}", + actor=events.UserActor.from_user(actor), + targets=[events.Target.from_model(user)], ) - ) - await session.commit() - return await get_user_model_by_name_or_error(session=session, username=username) + await session.commit() + return user async def refresh_ssh_key( session: AsyncSession, - user: UserModel, + actor: UserModel, username: Optional[str] = None, ) -> Optional[UserModel]: if username is None: - username = user.name - logger.debug("Refreshing SSH key for user [code]%s[/code]", username) - if user.global_role != GlobalRole.ADMIN and user.name != username: + username = actor.name + if actor.global_role != GlobalRole.ADMIN and actor.name != username: raise error_forbidden() - private_bytes, public_bytes = await run_async(crypto.generate_rsa_key_pair_bytes, username) - await session.execute( - update(UserModel) - .where( - UserModel.name == username, - UserModel.deleted == False, - ) - .values( - ssh_private_key=private_bytes.decode(), - ssh_public_key=public_bytes.decode(), + async with get_user_model_by_name_for_update(session, username) as user: + if user is None: + return None + private_bytes, public_bytes = await run_async(crypto.generate_rsa_key_pair_bytes, username) + user.ssh_private_key = private_bytes.decode() + user.ssh_public_key = public_bytes.decode() + events.emit( + session, + "User SSH key refreshed", + actor=events.UserActor.from_user(actor), + targets=[events.Target.from_model(user)], ) - ) - await session.commit() - return await get_user_model_by_name(session=session, username=username) + await session.commit() + return user async def refresh_user_token( session: AsyncSession, - user: UserModel, + actor: UserModel, username: str, ) -> Optional[UserModel]: - if user.global_role != GlobalRole.ADMIN and user.name != username: + if actor.global_role != GlobalRole.ADMIN and actor.name != username: raise error_forbidden() - new_token = str(uuid.uuid4()) - await session.execute( - 
update(UserModel) - .where( - UserModel.name == username, - UserModel.deleted == False, - ) - .values( - token=DecryptedString(plaintext=new_token), - token_hash=get_token_hash(new_token), + async with get_user_model_by_name_for_update(session, username) as user: + if user is None: + return None + new_token = str(uuid.uuid4()) + user.token = DecryptedString(plaintext=new_token) + user.token_hash = get_token_hash(new_token) + events.emit( + session, + "User token refreshed", + actor=events.UserActor.from_user(actor), + targets=[events.Target.from_model(user)], ) - ) - await session.commit() - return await get_user_model_by_name(session=session, username=username) + await session.commit() + return user async def delete_users( session: AsyncSession, - user: UserModel, + actor: UserModel, usernames: List[str], ): if _ADMIN_USERNAME in usernames: - raise ServerClientError("User 'admin' cannot be deleted") - - res = await session.execute( - select(UserModel) - .where( - UserModel.name.in_(usernames), - UserModel.deleted == False, - ) - .options(load_only(UserModel.id, UserModel.name)) - ) - users = res.scalars().all() - if len(users) != len(usernames): - raise ServerClientError("Failed to delete non-existent users") - - user_ids = [u.id for u in users] - timestamp = str(int(get_current_datetime().timestamp())) - updates = [] - for u in users: - updates.append( - { - "id": u.id, - "name": f"_deleted_{timestamp}_{secrets.token_hex(8)}", - "original_name": u.name, - "deleted": True, - "active": False, - } + raise ServerClientError(f"User {_ADMIN_USERNAME!r} cannot be deleted") + + filters = [ + UserModel.name.in_(usernames), + UserModel.deleted == False, + ] + res = await session.execute(select(UserModel.id).where(*filters)) + user_ids = list(res.scalars().all()) + user_ids.sort() + + async with get_locker(get_db().dialect_name).lock_ctx(UserModel.__tablename__, user_ids): + # Refetch after lock + res = await session.execute( + select(UserModel) + 
.where(UserModel.id.in_(user_ids), *filters) + .order_by(UserModel.id) # take locks in order + .options(load_only(UserModel.id, UserModel.name)) + .with_for_update(key_share=True) ) - await session.execute(update(UserModel), updates) - await session.execute(delete(MemberModel).where(MemberModel.user_id.in_(user_ids))) - # Projects are not deleted automatically if owners are deleted. - await session.commit() - logger.info("Deleted users %s by user %s", usernames, user.name) + users = list(res.scalars().all()) + if len(users) != len(usernames): + raise ServerClientError("Failed to delete non-existent users") + user_ids = [u.id for u in users] + timestamp = str(int(get_current_datetime().timestamp())) + for u in users: + event_target = events.Target.from_model(u) # build target before renaming the user + u.deleted = True + u.active = False + u.original_name = u.name + u.name = f"_deleted_{timestamp}_{secrets.token_hex(8)}" + events.emit( + session, + "User deleted", + actor=events.UserActor.from_user(actor), + targets=[event_target], + ) + await session.execute(delete(MemberModel).where(MemberModel.user_id.in_(user_ids))) + # Projects are not deleted automatically if owners are deleted. + await session.commit() async def get_user_model_by_name( @@ -257,6 +278,36 @@ async def get_user_model_by_name_or_error( ) +@asynccontextmanager +async def get_user_model_by_name_for_update( + session: AsyncSession, username: str +) -> AsyncGenerator[Optional[UserModel], None]: + """ + Fetch the user from the database and lock it for update. + + **NOTE**: commit changes to the database before exiting from this context manager, + so that in-memory locks are only released after commit. 
+ """ + + filters = [ + UserModel.name == username, + UserModel.deleted == False, + ] + res = await session.execute(select(UserModel.id).where(*filters)) + user_id = res.scalar_one_or_none() + if user_id is None: + yield None + else: + async with get_locker(get_db().dialect_name).lock_ctx(UserModel.__tablename__, [user_id]): + # Refetch after lock + res = await session.execute( + select(UserModel) + .where(UserModel.id.in_([user_id]), *filters) + .with_for_update(key_share=True) + ) + yield res.scalar_one_or_none() + + async def log_in_with_token(session: AsyncSession, token: str) -> Optional[UserModel]: token_hash = get_token_hash(token) res = await session.execute( diff --git a/src/tests/_internal/server/routers/test_projects.py b/src/tests/_internal/server/routers/test_projects.py index 8e21957f5e..826ecbc096 100644 --- a/src/tests/_internal/server/routers/test_projects.py +++ b/src/tests/_internal/server/routers/test_projects.py @@ -495,6 +495,16 @@ async def test_deletes_projects( await session.refresh(project2) assert project1.deleted assert not project2.deleted + # Validate an event is emitted + response = await client.post( + "/api/events/list", headers=get_auth_headers(user.token), json={} + ) + assert response.status_code == 200 + assert len(response.json()) == 1 + assert response.json()[0]["message"] == "Project deleted" + assert len(response.json()[0]["targets"]) == 1 + assert response.json()[0]["targets"][0]["id"] == str(project1.id) + assert response.json()[0]["targets"][0]["name"] == project_name @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) diff --git a/src/tests/_internal/server/routers/test_users.py b/src/tests/_internal/server/routers/test_users.py index 8b8c7ca2a6..6c5b373a63 100644 --- a/src/tests/_internal/server/routers/test_users.py +++ b/src/tests/_internal/server/routers/test_users.py @@ -392,9 +392,22 @@ async def test_deletes_users( json={"users": [user.name]}, ) assert 
response.status_code == 200 + + # Validate the user is deleted res = await session.execute(select(UserModel).where(UserModel.name == user.name)) assert len(res.scalars().all()) == 0 + # Validate an event is emitted + response = await client.post( + "/api/events/list", headers=get_auth_headers(admin.token), json={} + ) + assert response.status_code == 200 + assert len(response.json()) == 1 + assert response.json()[0]["message"] == "User deleted" + assert len(response.json()[0]["targets"]) == 1 + assert response.json()[0]["targets"][0]["id"] == str(user.id) + assert response.json()[0]["targets"][0]["name"] == user.name + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_returns_400_if_users_not_exist( From a36f34c78f2294c0781ec27105c759e1117ab252 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 18 Dec 2025 08:12:12 +0000 Subject: [PATCH 004/187] Implement shim auto-update (#3395) shim binary is replaced at any time, but restart is postponed until all tasks are terminated, as safe restart with running tasks requires additional work (see _get_restart_safe_task_statuses() comment). 
Closes: https://github.com/dstackai/dstack/issues/3288 --- runner/cmd/shim/main.go | 27 +- runner/consts/consts.go | 3 + runner/docs/shim.openapi.yaml | 51 ++- runner/internal/shim/api/handlers.go | 57 ++- runner/internal/shim/api/handlers_test.go | 4 +- runner/internal/shim/api/schemas.go | 4 + runner/internal/shim/api/server.go | 36 +- runner/internal/shim/components/runner.go | 41 +- runner/internal/shim/components/shim.go | 61 +++ runner/internal/shim/components/types.go | 12 +- runner/internal/shim/components/utils.go | 29 ++ runner/internal/shim/models.go | 7 +- .../_internal/core/backends/base/compute.py | 82 +++- .../background/tasks/process_instances.py | 179 ++++++-- src/dstack/_internal/server/schemas/runner.py | 7 +- .../server/services/gateways/__init__.py | 2 +- .../server/services/runner/client.py | 158 +++++-- .../_internal/server/utils/provisioning.py | 15 +- src/dstack/_internal/settings.py | 6 + .../core/backends/base/test_compute.py | 7 +- .../tasks/test_process_instances.py | 423 ++++++++++++++---- .../server/services/runner/test_client.py | 91 +++- 22 files changed, 1043 insertions(+), 259 deletions(-) create mode 100644 runner/internal/shim/components/shim.go diff --git a/runner/cmd/shim/main.go b/runner/cmd/shim/main.go index af468a6a93..79aefbda6a 100644 --- a/runner/cmd/shim/main.go +++ b/runner/cmd/shim/main.go @@ -40,6 +40,11 @@ func mainInner() int { log.DefaultEntry.Logger.SetLevel(logrus.Level(defaultLogLevel)) log.DefaultEntry.Logger.SetOutput(os.Stderr) + shimBinaryPath, err := os.Executable() + if err != nil { + shimBinaryPath = consts.ShimBinaryPath + } + cmd := &cli.Command{ Name: "dstack-shim", Usage: "Starts dstack-runner or docker container.", @@ -54,6 +59,14 @@ func mainInner() int { DefaultText: path.Join("~", consts.DstackDirPath), Sources: cli.EnvVars("DSTACK_SHIM_HOME"), }, + &cli.StringFlag{ + Name: "shim-binary-path", + Usage: "Path to shim's binary", + Value: shimBinaryPath, + Destination: &args.Shim.BinaryPath, + 
TakesFile: true, + Sources: cli.EnvVars("DSTACK_SHIM_BINARY_PATH"), + }, &cli.IntFlag{ Name: "shim-http-port", Usage: "Set shim's http port", @@ -172,6 +185,7 @@ func mainInner() int { func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error) { log.DefaultEntry.Logger.SetLevel(logrus.Level(args.Shim.LogLevel)) + log.Info(ctx, "Starting dstack-shim", "version", Version) shimHomeDir := args.Shim.HomeDir if shimHomeDir == "" { @@ -211,6 +225,10 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error) } else if runnerErr != nil { return runnerErr } + shimManager, shimErr := components.NewShimManager(ctx, args.Shim.BinaryPath) + if shimErr != nil { + return shimErr + } log.Debug(ctx, "Shim", "args", args.Shim) log.Debug(ctx, "Runner", "args", args.Runner) @@ -259,7 +277,11 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error) } address := fmt.Sprintf("localhost:%d", args.Shim.HTTPPort) - shimServer := api.NewShimServer(ctx, address, Version, dockerRunner, dcgmExporter, dcgmWrapper, runnerManager) + shimServer := api.NewShimServer( + ctx, address, Version, + dockerRunner, dcgmExporter, dcgmWrapper, + runnerManager, shimManager, + ) if serviceMode { if err := shim.WriteHostInfo(shimHomeDir, dockerRunner.Resources(ctx)); err != nil { @@ -278,6 +300,7 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error) if err := shimServer.Serve(); err != nil { serveErrCh <- err } + close(serveErrCh) }() select { @@ -287,7 +310,7 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error) shutdownCtx, cancelShutdown := context.WithTimeout(ctx, 5*time.Second) defer cancelShutdown() - shutdownErr := shimServer.Shutdown(shutdownCtx) + shutdownErr := shimServer.Shutdown(shutdownCtx, false) if serveErr != nil { return serveErr } diff --git a/runner/consts/consts.go b/runner/consts/consts.go index aa0b8d056f..2c392b5ee4 100644 --- a/runner/consts/consts.go +++ 
b/runner/consts/consts.go @@ -13,6 +13,9 @@ const ( // 2. A default path on the host unless overridden via shim CLI const RunnerBinaryPath = "/usr/local/bin/dstack-runner" +// A fallback path on the host used if os.Executable() has failed +const ShimBinaryPath = "/usr/local/bin/dstack-shim" + // Error-containing messages will be identified by this signature const ExecutorFailedSignature = "Executor failed" diff --git a/runner/docs/shim.openapi.yaml b/runner/docs/shim.openapi.yaml index e6f49fa079..e375e4e9d3 100644 --- a/runner/docs/shim.openapi.yaml +++ b/runner/docs/shim.openapi.yaml @@ -2,7 +2,7 @@ openapi: 3.1.2 info: title: dstack-shim API - version: v2/0.19.41 + version: v2/0.20.1 x-logo: url: https://avatars.githubusercontent.com/u/54146142?s=260 description: > @@ -41,7 +41,7 @@ paths: **Important**: Since this endpoint is used for negotiation, it should always stay backward/future compatible, specifically the `version` field - + tags: [shim] responses: "200": description: "" @@ -50,6 +50,29 @@ paths: schema: $ref: "#/components/schemas/HealthcheckResponse" + /shutdown: + post: + summary: Request shim shutdown + description: | + (since [0.20.1](https://github.com/dstackai/dstack/releases/tag/0.20.1)) Request shim to shut down itself. + Restart must be handled by an external process supervisor, e.g., `systemd`. + + **Note**: background jobs (e.g., component installation) are canceled regardless of the `force` option. 
+ tags: [shim] + requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ShutdownRequest" + responses: + "200": + description: Request accepted + $ref: "#/components/responses/PlainTextOk" + "400": + description: Malformed JSON body or validation error + $ref: "#/components/responses/PlainTextBadRequest" + /instance/health: get: summary: Get instance health @@ -66,7 +89,7 @@ paths: /components: get: summary: Get components - description: (since [0.19.41](https://github.com/dstackai/dstack/releases/tag/0.19.41)) Returns a list of software components (e.g., `dstack-runner`) + description: (since [0.20.0](https://github.com/dstackai/dstack/releases/tag/0.20.0)) Returns a list of software components (e.g., `dstack-runner`) tags: [Components] responses: "200": @@ -80,7 +103,7 @@ paths: post: summary: Install component description: > - (since [0.19.41](https://github.com/dstackai/dstack/releases/tag/0.19.41)) Request installing/updating the software component. + (since [0.20.0](https://github.com/dstackai/dstack/releases/tag/0.20.0)) Request installing/updating the software component. 
Components are installed asynchronously tags: [Components] requestBody: @@ -410,6 +433,10 @@ components: type: string enum: - dstack-runner + - dstack-shim + description: | + * (since [0.20.0](https://github.com/dstackai/dstack/releases/tag/0.20.0)) `dstack-runner` + * (since [0.20.1](https://github.com/dstackai/dstack/releases/tag/0.20.1)) `dstack-shim` ComponentStatus: title: shim.components.ComponentStatus @@ -430,7 +457,7 @@ components: type: string description: An empty string if status != installed examples: - - 0.19.41 + - 0.20.1 status: allOf: - $ref: "#/components/schemas/ComponentStatus" @@ -457,6 +484,18 @@ components: - version additionalProperties: false + ShutdownRequest: + title: shim.api.ShutdownRequest + type: object + properties: + force: + type: boolean + examples: + - false + description: If `true`, don't wait for background job coroutines to complete after canceling them and close HTTP server forcefully. + required: + - force + InstanceHealthResponse: title: shim.api.InstanceHealthResponse type: object @@ -486,7 +525,7 @@ components: url: type: string examples: - - https://dstack-runner-downloads.s3.eu-west-1.amazonaws.com/0.19.41/binaries/dstack-runner-linux-amd64 + - https://dstack-runner-downloads.s3.eu-west-1.amazonaws.com/0.20.1/binaries/dstack-runner-linux-amd64 required: - name - url diff --git a/runner/internal/shim/api/handlers.go b/runner/internal/shim/api/handlers.go index 7e4f172272..dc1be824cb 100644 --- a/runner/internal/shim/api/handlers.go +++ b/runner/internal/shim/api/handlers.go @@ -22,6 +22,21 @@ func (s *ShimServer) HealthcheckHandler(w http.ResponseWriter, r *http.Request) }, nil } +func (s *ShimServer) ShutdownHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { + var req ShutdownRequest + if err := api.DecodeJSONBody(w, r, &req, true); err != nil { + return nil, err + } + + go func() { + if err := s.Shutdown(s.ctx, req.Force); err != nil { + log.Error(s.ctx, "Shutdown", "err", err) + } + }() + + return 
nil, nil +} + func (s *ShimServer) InstanceHealthHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { ctx := r.Context() response := InstanceHealthResponse{} @@ -159,9 +174,11 @@ func (s *ShimServer) TaskMetricsHandler(w http.ResponseWriter, r *http.Request) } func (s *ShimServer) ComponentListHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { - runnerStatus := s.runnerManager.GetInfo(r.Context()) response := &ComponentListResponse{ - Components: []components.ComponentInfo{runnerStatus}, + Components: []components.ComponentInfo{ + s.runnerManager.GetInfo(r.Context()), + s.shimManager.GetInfo(r.Context()), + }, } return response, nil } @@ -176,27 +193,31 @@ func (s *ShimServer) ComponentInstallHandler(w http.ResponseWriter, r *http.Requ return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty name"} } + var componentManager components.ComponentManager switch components.ComponentName(req.Name) { case components.ComponentNameRunner: - if req.URL == "" { - return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty url"} - } - - // There is still a small chance of time-of-check race condition, but we ignore it. - runnerInfo := s.runnerManager.GetInfo(r.Context()) - if runnerInfo.Status == components.ComponentStatusInstalling { - return nil, &api.Error{Status: http.StatusConflict, Msg: "already installing"} - } - - s.bgJobsGroup.Go(func() { - if err := s.runnerManager.Install(s.bgJobsCtx, req.URL, true); err != nil { - log.Error(s.bgJobsCtx, "runner background install", "err", err) - } - }) - + componentManager = s.runnerManager + case components.ComponentNameShim: + componentManager = s.shimManager default: return nil, &api.Error{Status: http.StatusBadRequest, Msg: "unknown component"} } + if req.URL == "" { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty url"} + } + + // There is still a small chance of time-of-check race condition, but we ignore it. 
+ componentInfo := componentManager.GetInfo(r.Context()) + if componentInfo.Status == components.ComponentStatusInstalling { + return nil, &api.Error{Status: http.StatusConflict, Msg: "already installing"} + } + + s.bgJobsGroup.Go(func() { + if err := componentManager.Install(s.bgJobsCtx, req.URL, true); err != nil { + log.Error(s.bgJobsCtx, "component background install", "name", componentInfo.Name, "err", err) + } + }) + return nil, nil } diff --git a/runner/internal/shim/api/handlers_test.go b/runner/internal/shim/api/handlers_test.go index c04621eb0a..9bc829a94c 100644 --- a/runner/internal/shim/api/handlers_test.go +++ b/runner/internal/shim/api/handlers_test.go @@ -13,7 +13,7 @@ func TestHealthcheck(t *testing.T) { request := httptest.NewRequest("GET", "/api/healthcheck", nil) responseRecorder := httptest.NewRecorder() - server := NewShimServer(context.Background(), ":12345", "0.0.1.dev2", NewDummyRunner(), nil, nil, nil) + server := NewShimServer(context.Background(), ":12345", "0.0.1.dev2", NewDummyRunner(), nil, nil, nil, nil) f := common.JSONResponseHandler(server.HealthcheckHandler) f(responseRecorder, request) @@ -30,7 +30,7 @@ func TestHealthcheck(t *testing.T) { } func TestTaskSubmit(t *testing.T) { - server := NewShimServer(context.Background(), ":12340", "0.0.1.dev2", NewDummyRunner(), nil, nil, nil) + server := NewShimServer(context.Background(), ":12340", "0.0.1.dev2", NewDummyRunner(), nil, nil, nil, nil) requestBody := `{ "id": "dummy-id", "name": "dummy-name", diff --git a/runner/internal/shim/api/schemas.go b/runner/internal/shim/api/schemas.go index a7d5fa7d48..cd0db6a202 100644 --- a/runner/internal/shim/api/schemas.go +++ b/runner/internal/shim/api/schemas.go @@ -11,6 +11,10 @@ type HealthcheckResponse struct { Version string `json:"version"` } +type ShutdownRequest struct { + Force bool `json:"force"` +} + type InstanceHealthResponse struct { DCGM *dcgm.Health `json:"dcgm"` } diff --git a/runner/internal/shim/api/server.go 
b/runner/internal/shim/api/server.go index 15e0191354..0482db7945 100644 --- a/runner/internal/shim/api/server.go +++ b/runner/internal/shim/api/server.go @@ -9,6 +9,7 @@ import ( "sync" "github.com/dstackai/dstack/runner/internal/api" + "github.com/dstackai/dstack/runner/internal/log" "github.com/dstackai/dstack/runner/internal/shim" "github.com/dstackai/dstack/runner/internal/shim/components" "github.com/dstackai/dstack/runner/internal/shim/dcgm" @@ -26,8 +27,11 @@ type TaskRunner interface { } type ShimServer struct { - httpServer *http.Server - mu sync.RWMutex + httpServer *http.Server + mu sync.RWMutex + ctx context.Context + inShutdown bool + inForceShutdown bool bgJobsCtx context.Context bgJobsCancel context.CancelFunc @@ -38,7 +42,8 @@ type ShimServer struct { dcgmExporter *dcgm.DCGMExporter dcgmWrapper dcgm.DCGMWrapperInterface // interface with nil value normalized to plain nil - runnerManager *components.RunnerManager + runnerManager components.ComponentManager + shimManager components.ComponentManager version string } @@ -46,7 +51,7 @@ type ShimServer struct { func NewShimServer( ctx context.Context, address string, version string, runner TaskRunner, dcgmExporter *dcgm.DCGMExporter, dcgmWrapper dcgm.DCGMWrapperInterface, - runnerManager *components.RunnerManager, + runnerManager components.ComponentManager, shimManager components.ComponentManager, ) *ShimServer { bgJobsCtx, bgJobsCancel := context.WithCancel(ctx) if dcgmWrapper != nil && reflect.ValueOf(dcgmWrapper).IsNil() { @@ -59,6 +64,7 @@ func NewShimServer( Handler: r, BaseContext: func(l net.Listener) context.Context { return ctx }, }, + ctx: ctx, bgJobsCtx: bgJobsCtx, bgJobsCancel: bgJobsCancel, @@ -70,12 +76,14 @@ func NewShimServer( dcgmWrapper: dcgmWrapper, runnerManager: runnerManager, + shimManager: shimManager, version: version, } // The healthcheck endpoint should stay backward compatible, as it is used for negotiation r.AddHandler("GET", "/api/healthcheck", s.HealthcheckHandler) + 
r.AddHandler("POST", "/api/shutdown", s.ShutdownHandler) r.AddHandler("GET", "/api/instance/health", s.InstanceHealthHandler) r.AddHandler("GET", "/api/components", s.ComponentListHandler) r.AddHandler("POST", "/api/components/install", s.ComponentInstallHandler) @@ -96,8 +104,26 @@ func (s *ShimServer) Serve() error { return nil } -func (s *ShimServer) Shutdown(ctx context.Context) error { +func (s *ShimServer) Shutdown(ctx context.Context, force bool) error { + s.mu.Lock() + + if s.inForceShutdown || s.inShutdown && !force { + log.Info(ctx, "Already shutting down, ignoring request") + s.mu.Unlock() + return nil + } + + s.inShutdown = true + if force { + s.inForceShutdown = true + } + s.mu.Unlock() + + log.Info(ctx, "Shutting down", "force", force) s.bgJobsCancel() + if force { + return s.httpServer.Close() + } err := s.httpServer.Shutdown(ctx) s.bgJobsGroup.Wait() return err diff --git a/runner/internal/shim/components/runner.go b/runner/internal/shim/components/runner.go index b18f51d3c3..3dc361a251 100644 --- a/runner/internal/shim/components/runner.go +++ b/runner/internal/shim/components/runner.go @@ -2,13 +2,8 @@ package components import ( "context" - "errors" "fmt" - "os/exec" - "strings" "sync" - - "github.com/dstackai/dstack/runner/internal/common" ) type RunnerManager struct { @@ -42,7 +37,7 @@ func (m *RunnerManager) Install(ctx context.Context, url string, force bool) err m.mu.Lock() if m.status == ComponentStatusInstalling { m.mu.Unlock() - return errors.New("install runner: already installing") + return fmt.Errorf("install %s: already installing", ComponentNameRunner) } m.status = ComponentStatusInstalling m.version = "" @@ -57,38 +52,10 @@ func (m *RunnerManager) Install(ctx context.Context, url string, force bool) err return checkErr } -func (m *RunnerManager) check(ctx context.Context) error { +func (m *RunnerManager) check(ctx context.Context) (err error) { m.mu.Lock() defer m.mu.Unlock() - exists, err := common.PathExists(m.path) - if err != 
nil { - m.status = ComponentStatusError - m.version = "" - return fmt.Errorf("check runner: %w", err) - } - if !exists { - m.status = ComponentStatusNotInstalled - m.version = "" - return nil - } - - cmd := exec.CommandContext(ctx, m.path, "--version") - output, err := cmd.Output() - if err != nil { - m.status = ComponentStatusError - m.version = "" - return fmt.Errorf("check runner: %w", err) - } - - rawVersion := string(output) // dstack-runner version 0.19.38 - versionFields := strings.Fields(rawVersion) - if len(versionFields) != 3 { - m.status = ComponentStatusError - m.version = "" - return fmt.Errorf("check runner: unexpected version output: %s", rawVersion) - } - m.status = ComponentStatusInstalled - m.version = versionFields[2] - return nil + m.status, m.version, err = checkDstackComponent(ctx, ComponentNameRunner, m.path) + return err } diff --git a/runner/internal/shim/components/shim.go b/runner/internal/shim/components/shim.go new file mode 100644 index 0000000000..5ac9b08d39 --- /dev/null +++ b/runner/internal/shim/components/shim.go @@ -0,0 +1,61 @@ +package components + +import ( + "context" + "fmt" + "sync" +) + +type ShimManager struct { + path string + version string + status ComponentStatus + + mu *sync.RWMutex +} + +func NewShimManager(ctx context.Context, pth string) (*ShimManager, error) { + m := ShimManager{ + path: pth, + mu: &sync.RWMutex{}, + } + err := m.check(ctx) + return &m, err +} + +func (m *ShimManager) GetInfo(ctx context.Context) ComponentInfo { + m.mu.RLock() + defer m.mu.RUnlock() + return ComponentInfo{ + Name: ComponentNameShim, + Version: m.version, + Status: m.status, + } +} + +func (m *ShimManager) Install(ctx context.Context, url string, force bool) error { + m.mu.Lock() + if m.status == ComponentStatusInstalling { + m.mu.Unlock() + return fmt.Errorf("install %s: already installing", ComponentNameShim) + } + m.status = ComponentStatusInstalling + m.version = "" + m.mu.Unlock() + + downloadErr := downloadFile(ctx, url, 
m.path, 0o755, force) + // Recheck the binary even if the download has failed, just in case. + checkErr := m.check(ctx) + if downloadErr != nil { + return downloadErr + } + return checkErr +} + +func (m *ShimManager) check(ctx context.Context) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + + m.status, m.version, err = checkDstackComponent(ctx, ComponentNameShim, m.path) + return err +} diff --git a/runner/internal/shim/components/types.go b/runner/internal/shim/components/types.go index 13d1af857e..57c205af53 100644 --- a/runner/internal/shim/components/types.go +++ b/runner/internal/shim/components/types.go @@ -1,8 +1,13 @@ package components +import "context" + type ComponentName string -const ComponentNameRunner ComponentName = "dstack-runner" +const ( + ComponentNameRunner ComponentName = "dstack-runner" + ComponentNameShim ComponentName = "dstack-shim" +) type ComponentStatus string @@ -18,3 +23,8 @@ type ComponentInfo struct { Version string `json:"version"` Status ComponentStatus `json:"status"` } + +type ComponentManager interface { + GetInfo(ctx context.Context) ComponentInfo + Install(ctx context.Context, url string, force bool) error +} diff --git a/runner/internal/shim/components/utils.go b/runner/internal/shim/components/utils.go index 9161a64499..073832133d 100644 --- a/runner/internal/shim/components/utils.go +++ b/runner/internal/shim/components/utils.go @@ -7,9 +7,12 @@ import ( "io" "net/http" "os" + "os/exec" "path/filepath" + "strings" "time" + "github.com/dstackai/dstack/runner/internal/common" "github.com/dstackai/dstack/runner/internal/log" ) @@ -85,3 +88,29 @@ func downloadFile(ctx context.Context, url string, path string, mode os.FileMode return nil } + +func checkDstackComponent(ctx context.Context, name ComponentName, pth string) (status ComponentStatus, version string, err error) { + exists, err := common.PathExists(pth) + if err != nil { + return ComponentStatusError, "", fmt.Errorf("check %s: %w", name, err) + } + if !exists { + 
return ComponentStatusNotInstalled, "", nil + } + + cmd := exec.CommandContext(ctx, pth, "--version") + output, err := cmd.Output() + if err != nil { + return ComponentStatusError, "", fmt.Errorf("check %s: %w", name, err) + } + + rawVersion := string(output) // dstack-{shim,runner} version 0.19.38 + versionFields := strings.Fields(rawVersion) + if len(versionFields) != 3 { + return ComponentStatusError, "", fmt.Errorf("check %s: unexpected version output: %s", name, rawVersion) + } + if versionFields[0] != string(name) { + return ComponentStatusError, "", fmt.Errorf("check %s: unexpected component name: %s", name, versionFields[0]) + } + return ComponentStatusInstalled, versionFields[2], nil +} diff --git a/runner/internal/shim/models.go b/runner/internal/shim/models.go index b8da12670d..0a0c697eec 100644 --- a/runner/internal/shim/models.go +++ b/runner/internal/shim/models.go @@ -15,9 +15,10 @@ type DockerParameters interface { type CLIArgs struct { Shim struct { - HTTPPort int - HomeDir string - LogLevel int + HTTPPort int + HomeDir string + BinaryPath string + LogLevel int } Runner struct { diff --git a/src/dstack/_internal/core/backends/base/compute.py b/src/dstack/_internal/core/backends/base/compute.py index a0ff70c1ba..802aecb654 100644 --- a/src/dstack/_internal/core/backends/base/compute.py +++ b/src/dstack/_internal/core/backends/base/compute.py @@ -51,6 +51,7 @@ logger = get_logger(__name__) DSTACK_SHIM_BINARY_NAME = "dstack-shim" +DSTACK_SHIM_RESTART_INTERVAL_SECONDS = 3 DSTACK_RUNNER_BINARY_NAME = "dstack-runner" DEFAULT_PRIVATE_SUBNETS = ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16") NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES = frozenset( @@ -758,13 +759,35 @@ def get_shim_commands( return commands -def get_dstack_runner_version() -> str: - if settings.DSTACK_VERSION is not None: - return settings.DSTACK_VERSION - version = os.environ.get("DSTACK_RUNNER_VERSION", None) - if version is None and settings.DSTACK_USE_LATEST_FROM_BRANCH: - 
version = get_latest_runner_build() - return version or "latest" +def get_dstack_runner_version() -> Optional[str]: + if version := settings.DSTACK_VERSION: + return version + if version := settings.DSTACK_RUNNER_VERSION: + return version + if version_url := settings.DSTACK_RUNNER_VERSION_URL: + return _fetch_version(version_url) + if settings.DSTACK_USE_LATEST_FROM_BRANCH: + return get_latest_runner_build() + return None + + +def get_dstack_shim_version() -> Optional[str]: + if version := settings.DSTACK_VERSION: + return version + if version := settings.DSTACK_SHIM_VERSION: + return version + if version := settings.DSTACK_RUNNER_VERSION: + logger.warning( + "DSTACK_SHIM_VERSION is not set, using DSTACK_RUNNER_VERSION." + " Future versions will not fall back to DSTACK_RUNNER_VERSION." + " Set DSTACK_SHIM_VERSION to supress this warning." + ) + return version + if version_url := settings.DSTACK_SHIM_VERSION_URL: + return _fetch_version(version_url) + if settings.DSTACK_USE_LATEST_FROM_BRANCH: + return get_latest_runner_build() + return None def normalize_arch(arch: Optional[str] = None) -> GoArchType: @@ -789,7 +812,7 @@ def normalize_arch(arch: Optional[str] = None) -> GoArchType: def get_dstack_runner_download_url( arch: Optional[str] = None, version: Optional[str] = None ) -> str: - url_template = os.environ.get("DSTACK_RUNNER_DOWNLOAD_URL") + url_template = settings.DSTACK_RUNNER_DOWNLOAD_URL if not url_template: if settings.DSTACK_VERSION is not None: bucket = "dstack-runner-downloads" @@ -800,12 +823,12 @@ def get_dstack_runner_download_url( "/{version}/binaries/dstack-runner-linux-{arch}" ) if version is None: - version = get_dstack_runner_version() - return url_template.format(version=version, arch=normalize_arch(arch).value) + version = get_dstack_runner_version() or "latest" + return _format_download_url(url_template, version, arch) -def get_dstack_shim_download_url(arch: Optional[str] = None) -> str: - url_template = 
os.environ.get("DSTACK_SHIM_DOWNLOAD_URL") +def get_dstack_shim_download_url(arch: Optional[str] = None, version: Optional[str] = None) -> str: + url_template = settings.DSTACK_SHIM_DOWNLOAD_URL if not url_template: if settings.DSTACK_VERSION is not None: bucket = "dstack-runner-downloads" @@ -815,8 +838,9 @@ def get_dstack_shim_download_url(arch: Optional[str] = None) -> str: f"https://{bucket}.s3.eu-west-1.amazonaws.com" "/{version}/binaries/dstack-shim-linux-{arch}" ) - version = get_dstack_runner_version() - return url_template.format(version=version, arch=normalize_arch(arch).value) + if version is None: + version = get_dstack_shim_version() or "latest" + return _format_download_url(url_template, version, arch) def get_setup_cloud_instance_commands( @@ -878,8 +902,16 @@ def get_run_shim_script( dstack_shim_binary_path = get_dstack_shim_binary_path(bin_path) privileged_flag = "--privileged" if is_privileged else "" pjrt_device_env = f"--pjrt-device={pjrt_device}" if pjrt_device else "" + # TODO: Use a proper process supervisor? 
return [ - f"nohup {dstack_shim_binary_path} {privileged_flag} {pjrt_device_env} &", + f""" + nohup sh -c ' + while true; do + {dstack_shim_binary_path} {privileged_flag} {pjrt_device_env} + sleep {DSTACK_SHIM_RESTART_INTERVAL_SECONDS} + done + ' & + """, ] @@ -1022,9 +1054,7 @@ def get_dstack_gateway_wheel(build: str, router: Optional[AnyRouterConfig] = Non channel = "release" if settings.DSTACK_RELEASE else "stgn" base_url = f"https://dstack-gateway-downloads.s3.amazonaws.com/{channel}" if build == "latest": - r = requests.get(f"{base_url}/latest-version", timeout=5) - r.raise_for_status() - build = r.text.strip() + build = _fetch_version(f"{base_url}/latest-version") or "latest" logger.debug("Found the latest gateway build: %s", build) wheel = f"{base_url}/dstack_gateway-{build}-py3-none-any.whl" # Build package spec with extras if router is specified @@ -1034,7 +1064,7 @@ def get_dstack_gateway_wheel(build: str, router: Optional[AnyRouterConfig] = Non def get_dstack_gateway_commands(router: Optional[AnyRouterConfig] = None) -> List[str]: - build = get_dstack_runner_version() + build = get_dstack_runner_version() or "latest" gateway_package = get_dstack_gateway_wheel(build, router) return [ "mkdir -p /home/ubuntu/dstack", @@ -1069,3 +1099,17 @@ def requires_nvidia_proprietary_kernel_modules(gpu_name: str) -> bool: instead of open kernel modules. 
""" return gpu_name.lower() in NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES + + +def _fetch_version(url: str) -> Optional[str]: + r = requests.get(url, timeout=5) + r.raise_for_status() + version = r.text.strip() + if not version: + logger.warning("Empty version response from URL: %s", url) + return None + return version + + +def _format_download_url(template: str, version: str, arch: Optional[str]) -> str: + return template.format(version=version, arch=normalize_arch(arch).value) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 30ed2b1ec3..7d54171765 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -4,6 +4,7 @@ from datetime import timedelta from typing import Any, Dict, Optional, cast +import gpuhunt import requests from paramiko.pkey import PKey from paramiko.ssh_exception import PasswordRequiredException @@ -21,6 +22,8 @@ get_dstack_runner_download_url, get_dstack_runner_version, get_dstack_shim_binary_path, + get_dstack_shim_download_url, + get_dstack_shim_version, get_dstack_working_dir, get_shim_env, get_shim_pre_start_commands, @@ -65,6 +68,7 @@ ) from dstack._internal.server.schemas.instances import InstanceCheck from dstack._internal.server.schemas.runner import ( + ComponentInfo, ComponentStatus, HealthcheckResponse, InstanceHealthResponse, @@ -122,7 +126,6 @@ from dstack._internal.utils.ssh import ( pkey_from_str, ) -from dstack._internal.utils.version import parse_version MIN_PROCESSING_INTERVAL = timedelta(seconds=10) @@ -918,76 +921,170 @@ def _check_instance_inner( logger.exception(template, *args) return InstanceCheck(reachable=False, message=template % args) - _maybe_update_runner(instance, shim_client) - try: remove_dangling_tasks_from_instance(shim_client, instance) except Exception as e: logger.exception("%s: error removing dangling 
tasks: %s", fmt(instance), e) + # There should be no shim API calls after this function call since it can request shim restart. + _maybe_install_components(instance, shim_client) + return runner_client.healthcheck_response_to_instance_check( healthcheck_response, instance_health_response ) -def _maybe_update_runner(instance: InstanceModel, shim_client: runner_client.ShimClient) -> None: - # To auto-update to the latest runner dev build from the CI, see DSTACK_USE_LATEST_FROM_BRANCH. - expected_version_str = get_dstack_runner_version() +def _maybe_install_components( + instance: InstanceModel, shim_client: runner_client.ShimClient +) -> None: try: - expected_version = parse_version(expected_version_str) - except ValueError as e: - logger.warning("Failed to parse expected runner version: %s", e) + components = shim_client.get_components() + except requests.RequestException as e: + logger.warning("Instance %s: shim.get_components(): request error: %s", instance.name, e) return - if expected_version is None: - logger.debug("Cannot determine the expected runner version") + if components is None: + logger.debug("Instance %s: no components info", instance.name) return - try: - runner_info = shim_client.get_runner_info() - except requests.RequestException as e: - logger.warning("Instance %s: shim.get_runner_info(): request error: %s", instance.name, e) - return - if runner_info is None: + installed_shim_version: Optional[str] = None + installation_requested = False + + if (runner_info := components.runner) is not None: + installation_requested |= _maybe_install_runner(instance, shim_client, runner_info) + else: logger.debug("Instance %s: no runner info", instance.name) + + if (shim_info := components.shim) is not None: + if shim_info.status == ComponentStatus.INSTALLED: + installed_shim_version = shim_info.version + installation_requested |= _maybe_install_shim(instance, shim_client, shim_info) + else: + logger.debug("Instance %s: no shim info", instance.name) + + 
running_shim_version = shim_client.get_version_string() + if ( + # old shim without `dstack-shim` component and `/api/shutdown` support + installed_shim_version is None + # or the same version is already running + or installed_shim_version == running_shim_version + # or we just requested installation of at least one component + or installation_requested + # or at least one component is already being installed + or any(c.status == ComponentStatus.INSTALLING for c in components) + # or at least one shim task won't survive restart + or not shim_client.is_safe_to_restart() + ): return + if shim_client.shutdown(force=False): + logger.debug( + "Instance %s: restarting shim %s -> %s", + instance.name, + running_shim_version, + installed_shim_version, + ) + else: + logger.debug("Instance %s: cannot restart shim", instance.name) + + +def _maybe_install_runner( + instance: InstanceModel, shim_client: runner_client.ShimClient, runner_info: ComponentInfo +) -> bool: + # For developers: + # * To install the latest dev build for the current branch from the CI, + # set DSTACK_USE_LATEST_FROM_BRANCH=1. + # * To provide your own build, set DSTACK_RUNNER_VERSION_URL and DSTACK_RUNNER_DOWNLOAD_URL. 
+ expected_version = get_dstack_runner_version() + if expected_version is None: + logger.debug("Cannot determine the expected runner version") + return False + + installed_version = runner_info.version logger.debug( - "Instance %s: runner status=%s version=%s", + "Instance %s: runner status=%s installed_version=%s", instance.name, runner_info.status.value, - runner_info.version, + installed_version or "(no version)", ) - if runner_info.status == ComponentStatus.INSTALLING: - return - if runner_info.version: - try: - current_version = parse_version(runner_info.version) - except ValueError as e: - logger.warning("Instance %s: failed to parse runner version: %s", instance.name, e) - return - - if current_version is None or current_version >= expected_version: - logger.debug("Instance %s: the latest runner version already installed", instance.name) - return + if runner_info.status == ComponentStatus.INSTALLING: + logger.debug("Instance %s: runner is already being installed", instance.name) + return False - logger.debug( - "Instance %s: updating runner %s -> %s", - instance.name, - current_version, - expected_version, - ) - else: - logger.debug("Instance %s: installing runner %s", instance.name, expected_version) + if installed_version and installed_version == expected_version: + logger.debug("Instance %s: expected runner version already installed", instance.name) + return False - job_provisioning_data = get_or_error(get_instance_provisioning_data(instance)) url = get_dstack_runner_download_url( - arch=job_provisioning_data.instance_type.resources.cpu_arch, version=expected_version_str + arch=_get_instance_cpu_arch(instance), version=expected_version + ) + logger.debug( + "Instance %s: installing runner %s -> %s from %s", + instance.name, + installed_version or "(no version)", + expected_version, + url, ) try: shim_client.install_runner(url) + return True except requests.RequestException as e: logger.warning("Instance %s: shim.install_runner(): %s", instance.name, e) + 
return False + + +def _maybe_install_shim( + instance: InstanceModel, shim_client: runner_client.ShimClient, shim_info: ComponentInfo +) -> bool: + # For developers: + # * To install the latest dev build for the current branch from the CI, + # set DSTACK_USE_LATEST_FROM_BRANCH=1. + # * To provide your own build, set DSTACK_SHIM_VERSION_URL and DSTACK_SHIM_DOWNLOAD_URL. + expected_version = get_dstack_shim_version() + if expected_version is None: + logger.debug("Cannot determine the expected shim version") + return False + + installed_version = shim_info.version + logger.debug( + "Instance %s: shim status=%s installed_version=%s running_version=%s", + instance.name, + shim_info.status.value, + installed_version or "(no version)", + shim_client.get_version_string(), + ) + + if shim_info.status == ComponentStatus.INSTALLING: + logger.debug("Instance %s: shim is already being installed", instance.name) + return False + + if installed_version and installed_version == expected_version: + logger.debug("Instance %s: expected shim version already installed", instance.name) + return False + + url = get_dstack_shim_download_url( + arch=_get_instance_cpu_arch(instance), version=expected_version + ) + logger.debug( + "Instance %s: installing shim %s -> %s from %s", + instance.name, + installed_version or "(no version)", + expected_version, + url, + ) + try: + shim_client.install_shim(url) + return True + except requests.RequestException as e: + logger.warning("Instance %s: shim.install_shim(): %s", instance.name, e) + return False + + +def _get_instance_cpu_arch(instance: InstanceModel) -> Optional[gpuhunt.CPUArchitecture]: + jpd = get_instance_provisioning_data(instance) + if jpd is None: + return None + return jpd.instance_type.resources.cpu_arch async def _terminate(instance: InstanceModel) -> None: diff --git a/src/dstack/_internal/server/schemas/runner.py b/src/dstack/_internal/server/schemas/runner.py index f3c3614b58..12ff6c6825 100644 --- 
a/src/dstack/_internal/server/schemas/runner.py +++ b/src/dstack/_internal/server/schemas/runner.py @@ -121,8 +121,13 @@ class InstanceHealthResponse(CoreModel): dcgm: Optional[DCGMHealthResponse] = None +class ShutdownRequest(CoreModel): + force: bool + + class ComponentName(str, Enum): RUNNER = "dstack-runner" + SHIM = "dstack-shim" class ComponentStatus(str, Enum): @@ -133,7 +138,7 @@ class ComponentStatus(str, Enum): class ComponentInfo(CoreModel): - name: ComponentName + name: str # Not using ComponentName enum for compatibility of newer shim with older server version: str status: ComponentStatus diff --git a/src/dstack/_internal/server/services/gateways/__init__.py b/src/dstack/_internal/server/services/gateways/__init__.py index 682feaf31b..4ab80a8331 100644 --- a/src/dstack/_internal/server/services/gateways/__init__.py +++ b/src/dstack/_internal/server/services/gateways/__init__.py @@ -412,7 +412,7 @@ async def init_gateways(session: AsyncSession): if settings.SKIP_GATEWAY_UPDATE: logger.debug("Skipping gateways update due to DSTACK_SKIP_GATEWAY_UPDATE env variable") else: - build = get_dstack_runner_version() + build = get_dstack_runner_version() or "latest" for gateway_compute, res in await gather_map_async( gateway_computes, diff --git a/src/dstack/_internal/server/services/runner/client.py b/src/dstack/_internal/server/services/runner/client.py index b270d4ea5f..c83a42b744 100644 --- a/src/dstack/_internal/server/services/runner/client.py +++ b/src/dstack/_internal/server/services/runner/client.py @@ -1,10 +1,12 @@ import uuid +from collections.abc import Generator from http import HTTPStatus from typing import BinaryIO, Dict, List, Literal, Optional, TypeVar, Union, overload import packaging.version import requests import requests.exceptions +from typing_extensions import Self from dstack._internal.core.errors import DstackError from dstack._internal.core.models.common import CoreModel, NetworkMode @@ -28,9 +30,11 @@ MetricsResponse, PullResponse, 
ShimVolumeInfo, + ShutdownRequest, SubmitBody, TaskInfoResponse, TaskListResponse, + TaskStatus, TaskSubmitRequest, TaskTerminateRequest, ) @@ -143,7 +147,7 @@ class ShimError(DstackError): pass -class ShimHTTPError(DstackError): +class ShimHTTPError(ShimError): """ An HTTP error wrapper for `requests.exceptions.HTTPError`. Should be used as follows: @@ -185,6 +189,47 @@ class ShimAPIVersionError(ShimError): pass +class ComponentList: + _items: dict[ComponentName, ComponentInfo] + + def __init__(self) -> None: + self._items = {} + + def __iter__(self) -> Generator[ComponentInfo, None, None]: + for component_info in self._items.values(): + yield component_info + + @classmethod + def from_response(cls, response: ComponentListResponse) -> Self: + components = cls() + for component_info in response.components: + try: + components.add(component_info) + except ValueError as e: + logger.warning("Error processing ComponentInfo: %s", e) + return components + + @property + def runner(self) -> Optional[ComponentInfo]: + return self.get(ComponentName.RUNNER) + + @property + def shim(self) -> Optional[ComponentInfo]: + return self.get(ComponentName.SHIM) + + def get(self, name: ComponentName) -> Optional[ComponentInfo]: + return self._items.get(name) + + def add(self, component_info: ComponentInfo) -> None: + try: + name = ComponentName(component_info.name) + except ValueError as e: + raise ValueError(f"Unknown component: {component_info.name}") from e + if name in self._items: + raise ValueError(f"Duplicate component: {component_info.name}") + self._items[name] = component_info + + class ShimClient: # API v2 (a.k.a. Future API) — `/api/tasks/[:id[/{terminate,remove}]]` # API v1 (a.k.a. 
Legacy API) — `/api/{submit,pull,stop}` @@ -194,14 +239,16 @@ class ShimClient: _INSTANCE_HEALTH_MIN_SHIM_VERSION = (0, 19, 22) # `/api/components` - _COMPONENTS_RUNNER_MIN_SHIM_VERSION = (0, 19, 41) + _COMPONENTS_MIN_SHIM_VERSION = (0, 20, 0) + + # `/api/shutdown` + _SHUTDOWN_MIN_SHIM_VERSION = (0, 20, 1) - _shim_version: Optional["_Version"] + _shim_version_string: str + _shim_version_tuple: Optional["_Version"] _api_version: int _negotiated: bool = False - _components: Optional[dict[ComponentName, ComponentInfo]] = None - def __init__( self, port: int, @@ -212,6 +259,16 @@ def __init__( # Methods shared by all API versions + def get_version_string(self) -> str: + if not self._negotiated: + self._negotiate() + return self._shim_version_string + + def get_version_tuple(self) -> Optional["_Version"]: + if not self._negotiated: + self._negotiate() + return self._shim_version_tuple + def is_api_v2_supported(self) -> bool: if not self._negotiated: self._negotiate() @@ -221,16 +278,24 @@ def is_instance_health_supported(self) -> bool: if not self._negotiated: self._negotiate() return ( - self._shim_version is None - or self._shim_version >= self._INSTANCE_HEALTH_MIN_SHIM_VERSION + self._shim_version_tuple is None + or self._shim_version_tuple >= self._INSTANCE_HEALTH_MIN_SHIM_VERSION ) - def is_runner_component_supported(self) -> bool: + def are_components_supported(self) -> bool: if not self._negotiated: self._negotiate() return ( - self._shim_version is None - or self._shim_version >= self._COMPONENTS_RUNNER_MIN_SHIM_VERSION + self._shim_version_tuple is None + or self._shim_version_tuple >= self._COMPONENTS_MIN_SHIM_VERSION + ) + + def is_shutdown_supported(self) -> bool: + if not self._negotiated: + self._negotiate() + return ( + self._shim_version_tuple is None + or self._shim_version_tuple >= self._SHUTDOWN_MIN_SHIM_VERSION ) @overload @@ -254,7 +319,7 @@ def healthcheck(self, unmask_exceptions: bool = False) -> Optional[HealthcheckRe def 
get_instance_health(self) -> Optional[InstanceHealthResponse]: if not self.is_instance_health_supported(): - logger.debug("instance health is not supported: %s", self._shim_version) + logger.debug("instance health is not supported: %s", self._shim_version_string) return None resp = self._request("GET", "/api/instance/health") if resp.status_code == HTTPStatus.NOT_FOUND: @@ -263,12 +328,37 @@ def get_instance_health(self) -> Optional[InstanceHealthResponse]: self._raise_for_status(resp) return self._response(InstanceHealthResponse, resp) - def get_runner_info(self) -> Optional[ComponentInfo]: - if not self.is_runner_component_supported(): - logger.debug("runner info is not supported: %s", self._shim_version) + def shutdown(self, *, force: bool) -> bool: + if not self.is_shutdown_supported(): + logger.debug("shim shutdown is not supported: %s", self._shim_version_string) + return False + body = ShutdownRequest(force=force) + resp = self._request("POST", "/api/shutdown", body) + # TODO: Remove this check after 0.20.1 release, use _request(..., raise_for_status=True) + if resp.status_code == HTTPStatus.NOT_FOUND and self._shim_version_tuple is None: + # Old dev build of shim + logger.debug("shim shutdown is not supported: %s", self._shim_version_string) + return False + self._raise_for_status(resp) + return True + + def is_safe_to_restart(self) -> bool: + if not self.is_api_v2_supported(): + # old shim, `/api/shutdown` is not supported anyway + return False + task_list = self.list_tasks() + if (tasks := task_list.tasks) is None: + # old shim, `/api/shutdown` is not supported anyway + return False + restart_safe_task_statuses = self._get_restart_safe_task_statuses() + return all(t.status in restart_safe_task_statuses for t in tasks) + + def get_components(self) -> Optional[ComponentList]: + if not self.are_components_supported(): + logger.debug("components are not supported: %s", self._shim_version_string) return None - components = self._get_components() - return 
components.get(ComponentName.RUNNER) + resp = self._request("GET", "/api/components", raise_for_status=True) + return ComponentList.from_response(self._response(ComponentListResponse, resp)) def install_runner(self, url: str) -> None: body = ComponentInstallRequest( @@ -277,6 +367,13 @@ def install_runner(self, url: str) -> None: ) self._request("POST", "/api/components/install", body, raise_for_status=True) + def install_shim(self, url: str) -> None: + body = ComponentInstallRequest( + name=ComponentName.SHIM, + url=url, + ) + self._request("POST", "/api/components/install", body, raise_for_status=True) + def list_tasks(self) -> TaskListResponse: if not self.is_api_v2_supported(): raise ShimAPIVersionError() @@ -459,30 +556,23 @@ def _raise_for_status(self, response: requests.Response) -> None: def _negotiate(self, healthcheck_response: Optional[requests.Response] = None) -> None: if healthcheck_response is None: healthcheck_response = self._request("GET", "/api/healthcheck", raise_for_status=True) - raw_version = self._response(HealthcheckResponse, healthcheck_response).version - version = _parse_version(raw_version) - if version is None or version >= self._API_V2_MIN_SHIM_VERSION: + version_string = self._response(HealthcheckResponse, healthcheck_response).version + version_tuple = _parse_version(version_string) + if version_tuple is None or version_tuple >= self._API_V2_MIN_SHIM_VERSION: api_version = 2 else: api_version = 1 - logger.debug( - "shim version: %s %s (API v%s)", - raw_version, - version or "(latest)", - api_version, - ) - self._shim_version = version + self._shim_version_string = version_string + self._shim_version_tuple = version_tuple self._api_version = api_version self._negotiated = True - def _get_components(self) -> dict[ComponentName, ComponentInfo]: - resp = self._request("GET", "/api/components") - # TODO: Remove this check after 0.19.41 release, use _request(..., raise_for_status=True) - if resp.status_code == HTTPStatus.NOT_FOUND and 
self._shim_version is None: - # Old dev build of shim - return {} - resp.raise_for_status() - return {c.name: c for c in self._response(ComponentListResponse, resp).components} + def _get_restart_safe_task_statuses(self) -> list[TaskStatus]: + # TODO: Rework shim's DockerRunner.Run() so that it does not wait for container termination + # (this at least requires replacing .waitContainer() with periodic polling of container + # statuses and moving some cleanup defer calls to .Terminate() and/or .Remove()) and add + # TaskStatus.RUNNING to the list of restart-safe task statuses for supported shim versions. + return [TaskStatus.TERMINATED] def healthcheck_response_to_instance_check( diff --git a/src/dstack/_internal/server/utils/provisioning.py b/src/dstack/_internal/server/utils/provisioning.py index 632dce777a..fcbe3bf086 100644 --- a/src/dstack/_internal/server/utils/provisioning.py +++ b/src/dstack/_internal/server/utils/provisioning.py @@ -8,7 +8,11 @@ import paramiko from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib -from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch +from dstack._internal.core.backends.base.compute import ( + DSTACK_SHIM_RESTART_INTERVAL_SECONDS, + GoArchType, + normalize_arch, +) from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute @@ -116,16 +120,23 @@ def run_pre_start_commands( def run_shim_as_systemd_service( client: paramiko.SSHClient, binary_path: str, working_dir: str, dev: bool ) -> None: + # Stop restart attempts after ≈ 1 hour + start_limit_interval_seconds = 3600 + start_limit_burst = int( + start_limit_interval_seconds / DSTACK_SHIM_RESTART_INTERVAL_SECONDS * 0.9 + ) shim_service = dedent(f"""\ [Unit] Description=dstack-shim After=network-online.target + StartLimitIntervalSec={start_limit_interval_seconds} + StartLimitBurst={start_limit_burst} [Service] Type=simple User=root 
Restart=always - RestartSec=10 + RestartSec={DSTACK_SHIM_RESTART_INTERVAL_SECONDS} WorkingDirectory={working_dir} EnvironmentFile={working_dir}/{DSTACK_SHIM_ENV_FILE} ExecStart={binary_path} diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py index 245681411d..81682480a2 100644 --- a/src/dstack/_internal/settings.py +++ b/src/dstack/_internal/settings.py @@ -10,6 +10,12 @@ # TODO: update the code to treat 0.0.0 as dev version. DSTACK_VERSION = None DSTACK_RELEASE = os.getenv("DSTACK_RELEASE") is not None or version.__is_release__ +DSTACK_RUNNER_VERSION = os.getenv("DSTACK_RUNNER_VERSION") +DSTACK_RUNNER_VERSION_URL = os.getenv("DSTACK_RUNNER_VERSION_URL") +DSTACK_RUNNER_DOWNLOAD_URL = os.getenv("DSTACK_RUNNER_DOWNLOAD_URL") +DSTACK_SHIM_VERSION = os.getenv("DSTACK_SHIM_VERSION") +DSTACK_SHIM_VERSION_URL = os.getenv("DSTACK_SHIM_VERSION_URL") +DSTACK_SHIM_DOWNLOAD_URL = os.getenv("DSTACK_SHIM_DOWNLOAD_URL") DSTACK_USE_LATEST_FROM_BRANCH = os.getenv("DSTACK_USE_LATEST_FROM_BRANCH") is not None diff --git a/src/tests/_internal/core/backends/base/test_compute.py b/src/tests/_internal/core/backends/base/test_compute.py index 848aea822c..7892a3f0f5 100644 --- a/src/tests/_internal/core/backends/base/test_compute.py +++ b/src/tests/_internal/core/backends/base/test_compute.py @@ -1,6 +1,7 @@ import re from typing import Optional +import gpuhunt import pytest from dstack._internal.core.backends.base.compute import ( @@ -62,11 +63,13 @@ def test_validates_project_name(self): class TestNormalizeArch: - @pytest.mark.parametrize("arch", [None, "", "X86", "x86_64", "AMD64"]) + @pytest.mark.parametrize( + "arch", [None, "", "X86", "x86_64", "AMD64", gpuhunt.CPUArchitecture.X86] + ) def test_amd64(self, arch: Optional[str]): assert normalize_arch(arch) is GoArchType.AMD64 - @pytest.mark.parametrize("arch", ["arm", "ARM64", "AArch64"]) + @pytest.mark.parametrize("arch", ["arm", "ARM64", "AArch64", gpuhunt.CPUArchitecture.ARM]) def test_arm64(self, 
arch: str): assert normalize_arch(arch) is GoArchType.ARM64 diff --git a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/tasks/test_process_instances.py index e7c44ab434..cb5028c42b 100644 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ b/src/tests/_internal/server/background/tasks/test_process_instances.py @@ -8,6 +8,7 @@ import gpuhunt import pytest +import pytest_asyncio from freezegun import freeze_time from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -41,7 +42,11 @@ delete_instance_health_checks, process_instances, ) -from dstack._internal.server.models import InstanceHealthCheckModel, PlacementGroupModel +from dstack._internal.server.models import ( + InstanceHealthCheckModel, + InstanceModel, + PlacementGroupModel, +) from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse, DCGMHealthResult from dstack._internal.server.schemas.instances import InstanceCheck from dstack._internal.server.schemas.runner import ( @@ -54,7 +59,7 @@ TaskListResponse, TaskStatus, ) -from dstack._internal.server.services.runner.client import ShimClient +from dstack._internal.server.services.runner.client import ComponentList, ShimClient from dstack._internal.server.testing.common import ( ComputeMockSpec, create_fleet, @@ -390,14 +395,14 @@ async def test_check_shim_check_instance_health(self, test_db, session: AsyncSes assert health_check.response == health_response.json() +@pytest.mark.usefixtures("disable_maybe_install_components") class TestRemoveDanglingTasks: - @pytest.fixture(autouse=True) - def disable_runner_update_check(self) -> Generator[None, None, None]: - with patch( - "dstack._internal.server.background.tasks.process_instances.get_dstack_runner_version" - ) as get_dstack_runner_version_mock: - get_dstack_runner_version_mock.return_value = "latest" - yield + @pytest.fixture + def disable_maybe_install_components(self, 
monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "dstack._internal.server.background.tasks.process_instances._maybe_install_components", + Mock(return_value=None), + ) @pytest.fixture def ssh_tunnel_mock(self) -> Generator[Mock, None, None]: @@ -1163,33 +1168,71 @@ async def test_deletes_instance_health_checks( @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) -@pytest.mark.usefixtures( - "test_db", "ssh_tunnel_mock", "shim_client_mock", "get_dstack_runner_version_mock" -) -class TestMaybeUpdateRunner: +@pytest.mark.usefixtures("test_db", "instance", "ssh_tunnel_mock", "shim_client_mock") +class BaseTestMaybeInstallComponents: + EXPECTED_VERSION = "0.20.1" + + @pytest_asyncio.fixture + async def instance(self, session: AsyncSession) -> InstanceModel: + project = await create_project(session=session) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.BUSY + ) + return instance + + @pytest.fixture + def component_list(self) -> ComponentList: + return ComponentList() + + @pytest.fixture + def debug_task_log(self, caplog: pytest.LogCaptureFixture) -> pytest.LogCaptureFixture: + caplog.set_level( + level=logging.DEBUG, + logger="dstack._internal.server.background.tasks.process_instances", + ) + return caplog + @pytest.fixture def ssh_tunnel_mock(self, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr("dstack._internal.server.services.runner.ssh.SSHTunnel", MagicMock()) @pytest.fixture - def shim_client_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + def shim_client_mock( + self, + monkeypatch: pytest.MonkeyPatch, + component_list: ComponentList, + ) -> Mock: mock = Mock(spec_set=ShimClient) mock.healthcheck.return_value = HealthcheckResponse( - service="dstack-shim", version="0.19.40" + service="dstack-shim", version=self.EXPECTED_VERSION ) mock.get_instance_health.return_value = InstanceHealthResponse() - mock.get_runner_info.return_value = 
ComponentInfo( - name=ComponentName.RUNNER, version="0.19.40", status=ComponentStatus.INSTALLED - ) + mock.get_components.return_value = component_list mock.list_tasks.return_value = TaskListResponse(tasks=[]) + mock.is_safe_to_restart.return_value = False monkeypatch.setattr( "dstack._internal.server.services.runner.client.ShimClient", Mock(return_value=mock) ) return mock + +@pytest.mark.usefixtures("get_dstack_runner_version_mock") +class TestMaybeInstallRunner(BaseTestMaybeInstallComponents): + @pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.RUNNER, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + @pytest.fixture def get_dstack_runner_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: - mock = Mock(return_value="0.19.41") + mock = Mock(return_value=self.EXPECTED_VERSION) monkeypatch.setattr( "dstack._internal.server.background.tasks.process_instances.get_dstack_runner_version", mock, @@ -1207,112 +1250,328 @@ def get_dstack_runner_download_url_mock(self, monkeypatch: pytest.MonkeyPatch) - async def test_cannot_determine_expected_version( self, - caplog: pytest.LogCaptureFixture, - session: AsyncSession, + debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock, get_dstack_runner_version_mock: Mock, ): - caplog.set_level(logging.DEBUG) - project = await create_project(session=session) - await create_instance(session=session, project=project, status=InstanceStatus.IDLE) - get_dstack_runner_version_mock.return_value = "latest" + get_dstack_runner_version_mock.return_value = None await process_instances() - assert "Cannot determine the expected runner version" in caplog.text - shim_client_mock.get_runner_info.assert_not_called() + assert "Cannot determine the expected runner version" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() 
shim_client_mock.install_runner.assert_not_called() - async def test_failed_to_parse_current_version( - self, - caplog: pytest.LogCaptureFixture, - session: AsyncSession, - shim_client_mock: Mock, + async def test_expected_version_already_installed( + self, debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock ): - caplog.set_level(logging.WARNING) - project = await create_project(session=session) - await create_instance(session=session, project=project, status=InstanceStatus.IDLE) - shim_client_mock.get_runner_info.return_value.version = "invalid" + shim_client_mock.get_components.return_value.runner.version = self.EXPECTED_VERSION await process_instances() - assert "failed to parse runner version" in caplog.text - shim_client_mock.get_runner_info.assert_called_once() + assert "expected runner version already installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() shim_client_mock.install_runner.assert_not_called() - @pytest.mark.parametrize("current_version", ["latest", "0.0.0", "0.19.41", "0.19.42"]) - async def test_latest_version_already_installed( + @pytest.mark.parametrize("status", [ComponentStatus.NOT_INSTALLED, ComponentStatus.ERROR]) + async def test_install_not_installed_or_error( self, - caplog: pytest.LogCaptureFixture, - session: AsyncSession, + debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock, - current_version: str, + get_dstack_runner_download_url_mock: Mock, + status: ComponentStatus, ): - caplog.set_level(logging.DEBUG) - project = await create_project(session=session) - await create_instance(session=session, project=project, status=InstanceStatus.IDLE) - shim_client_mock.get_runner_info.return_value.version = current_version + shim_client_mock.get_components.return_value.runner.version = "" + shim_client_mock.get_components.return_value.runner.status = status await process_instances() - assert "the latest runner version already installed" in caplog.text - 
shim_client_mock.get_runner_info.assert_called_once() - shim_client_mock.install_runner.assert_not_called() + assert f"installing runner (no version) -> {self.EXPECTED_VERSION}" in debug_task_log.text + get_dstack_runner_download_url_mock.assert_called_once_with( + arch=None, version=self.EXPECTED_VERSION + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_called_once_with( + get_dstack_runner_download_url_mock.return_value + ) - async def test_install_not_installed( + @pytest.mark.parametrize("installed_version", ["0.19.40", "0.21.0", "dev"]) + async def test_install_installed( self, - caplog: pytest.LogCaptureFixture, - session: AsyncSession, + debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock, get_dstack_runner_download_url_mock: Mock, + installed_version: str, ): - caplog.set_level(logging.DEBUG) - project = await create_project(session=session) - await create_instance(session=session, project=project, status=InstanceStatus.IDLE) - shim_client_mock.get_runner_info.return_value.version = "" - shim_client_mock.get_runner_info.return_value.status = ComponentStatus.NOT_INSTALLED + shim_client_mock.get_components.return_value.runner.version = installed_version await process_instances() - assert "installing runner 0.19.41" in caplog.text - get_dstack_runner_download_url_mock.assert_called_once_with(arch=None, version="0.19.41") - shim_client_mock.get_runner_info.assert_called_once() + assert ( + f"installing runner {installed_version} -> {self.EXPECTED_VERSION}" + in debug_task_log.text + ) + get_dstack_runner_download_url_mock.assert_called_once_with( + arch=None, version=self.EXPECTED_VERSION + ) + shim_client_mock.get_components.assert_called_once() shim_client_mock.install_runner.assert_called_once_with( get_dstack_runner_download_url_mock.return_value ) - async def test_update_outdated( + async def test_already_installing( + self, debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock + ): 
+ shim_client_mock.get_components.return_value.runner.version = "dev" + shim_client_mock.get_components.return_value.runner.status = ComponentStatus.INSTALLING + + await process_instances() + + assert "runner is already being installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_not_called() + + +@pytest.mark.usefixtures("get_dstack_shim_version_mock") +class TestMaybeInstallShim(BaseTestMaybeInstallComponents): + @pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.SHIM, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + + @pytest.fixture + def get_dstack_shim_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=self.EXPECTED_VERSION) + monkeypatch.setattr( + "dstack._internal.server.background.tasks.process_instances.get_dstack_shim_version", + mock, + ) + return mock + + @pytest.fixture + def get_dstack_shim_download_url_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value="https://example.com/shim") + monkeypatch.setattr( + "dstack._internal.server.background.tasks.process_instances.get_dstack_shim_download_url", + mock, + ) + return mock + + async def test_cannot_determine_expected_version( self, - caplog: pytest.LogCaptureFixture, - session: AsyncSession, + debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock, - get_dstack_runner_download_url_mock: Mock, + get_dstack_shim_version_mock: Mock, ): - caplog.set_level(logging.DEBUG) - project = await create_project(session=session) - await create_instance(session=session, project=project, status=InstanceStatus.IDLE) - shim_client_mock.get_runner_info.return_value.version = "0.19.38" + get_dstack_shim_version_mock.return_value = None await process_instances() - assert "updating runner 0.19.38 -> 0.19.41" in caplog.text - 
get_dstack_runner_download_url_mock.assert_called_once_with(arch=None, version="0.19.41") - shim_client_mock.get_runner_info.assert_called_once() - shim_client_mock.install_runner.assert_called_once_with( - get_dstack_runner_download_url_mock.return_value + assert "Cannot determine the expected shim version" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + async def test_expected_version_already_installed( + self, debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock + ): + shim_client_mock.get_components.return_value.shim.version = self.EXPECTED_VERSION + + await process_instances() + + assert "expected shim version already installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + @pytest.mark.parametrize("status", [ComponentStatus.NOT_INSTALLED, ComponentStatus.ERROR]) + async def test_install_not_installed_or_error( + self, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_shim_download_url_mock: Mock, + status: ComponentStatus, + ): + shim_client_mock.get_components.return_value.shim.version = "" + shim_client_mock.get_components.return_value.shim.status = status + + await process_instances() + + assert f"installing shim (no version) -> {self.EXPECTED_VERSION}" in debug_task_log.text + get_dstack_shim_download_url_mock.assert_called_once_with( + arch=None, version=self.EXPECTED_VERSION + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_called_once_with( + get_dstack_shim_download_url_mock.return_value ) - async def test_already_updating( + @pytest.mark.parametrize("installed_version", ["0.19.40", "0.21.0", "dev"]) + async def test_install_installed( self, - session: AsyncSession, + debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock, + get_dstack_shim_download_url_mock: Mock, + 
installed_version: str, ): - project = await create_project(session=session) - await create_instance(session=session, project=project, status=InstanceStatus.IDLE) - shim_client_mock.get_runner_info.return_value.version = "0.19.38" - shim_client_mock.get_runner_info.return_value.status = ComponentStatus.INSTALLING + shim_client_mock.get_components.return_value.shim.version = installed_version await process_instances() - shim_client_mock.get_runner_info.assert_called_once() - shim_client_mock.install_runner.assert_not_called() + assert ( + f"installing shim {installed_version} -> {self.EXPECTED_VERSION}" + in debug_task_log.text + ) + get_dstack_shim_download_url_mock.assert_called_once_with( + arch=None, version=self.EXPECTED_VERSION + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_called_once_with( + get_dstack_shim_download_url_mock.return_value + ) + + async def test_already_installing( + self, debug_task_log: pytest.LogCaptureFixture, shim_client_mock: Mock + ): + shim_client_mock.get_components.return_value.shim.version = "dev" + shim_client_mock.get_components.return_value.shim.status = ComponentStatus.INSTALLING + + await process_instances() + + assert "shim is already being installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + +@pytest.mark.usefixtures("maybe_install_runner_mock", "maybe_install_shim_mock") +class TestMaybeRestartShim(BaseTestMaybeInstallComponents): + @pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.RUNNER, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + components.add( + ComponentInfo( + name=ComponentName.SHIM, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + + @pytest.fixture + def maybe_install_runner_mock(self, 
monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=False) + monkeypatch.setattr( + "dstack._internal.server.background.tasks.process_instances._maybe_install_runner", + mock, + ) + return mock + + @pytest.fixture + def maybe_install_shim_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=False) + monkeypatch.setattr( + "dstack._internal.server.background.tasks.process_instances._maybe_install_shim", + mock, + ) + return mock + + async def test_up_to_date(self, shim_client_mock: Mock): + shim_client_mock.get_version_string.return_value = self.EXPECTED_VERSION + shim_client_mock.is_safe_to_restart.return_value = True + + await process_instances() + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_no_shim_component_info(self, shim_client_mock: Mock): + shim_client_mock.get_components.return_value = ComponentList() + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + + await process_instances() + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_shutdown_requested(self, shim_client_mock: Mock): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + + await process_instances() + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_called_once_with(force=False) + + async def test_outdated_but_task_wont_survive_restart(self, shim_client_mock: Mock): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = False + + await process_instances() + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_runner_installation_in_progress( + self, shim_client_mock: Mock, 
component_list: ComponentList + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + runner_info = component_list.runner + assert runner_info is not None + runner_info.status = ComponentStatus.INSTALLING + + await process_instances() + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_shim_installation_in_progress( + self, shim_client_mock: Mock, component_list: ComponentList + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + shim_info = component_list.shim + assert shim_info is not None + shim_info.status = ComponentStatus.INSTALLING + + await process_instances() + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_runner_installation_requested( + self, shim_client_mock: Mock, maybe_install_runner_mock: Mock + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + maybe_install_runner_mock.return_value = True + + await process_instances() + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_shim_installation_requested( + self, shim_client_mock: Mock, maybe_install_shim_mock: Mock + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + maybe_install_shim_mock.return_value = True + + await process_instances() + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() diff --git a/src/tests/_internal/server/services/runner/test_client.py b/src/tests/_internal/server/services/runner/test_client.py index e68a007cff..588c231a19 100644 --- 
a/src/tests/_internal/server/services/runner/test_client.py +++ b/src/tests/_internal/server/services/runner/test_client.py @@ -99,7 +99,7 @@ def test( client._negotiate() - assert client._shim_version == expected_shim_version + assert client._shim_version_tuple == expected_shim_version assert client._api_version == expected_api_version assert adapter.call_count == 1 self.assert_request(adapter, 0, "GET", "/api/healthcheck") @@ -129,7 +129,7 @@ def test_healthcheck(self, client: ShimClient, adapter: requests_mock.Adapter): assert adapter.call_count == 1 self.assert_request(adapter, 0, "GET", "/api/healthcheck") # healthcheck() method also performs negotiation to save API calls - assert client._shim_version == (0, 18, 30) + assert client._shim_version_tuple == (0, 18, 30) assert client._api_version == 1 def test_submit(self, client: ShimClient, adapter: requests_mock.Adapter): @@ -262,9 +262,94 @@ def test_healthcheck(self, client: ShimClient, adapter: requests_mock.Adapter): assert adapter.call_count == 1 self.assert_request(adapter, 0, "GET", "/api/healthcheck") # healthcheck() method also performs negotiation to save API calls - assert client._shim_version == (0, 18, 40) + assert client._shim_version_tuple == (0, 18, 40) assert client._api_version == 2 + def test_is_safe_to_restart_false_old_shim( + self, client: ShimClient, adapter: requests_mock.Adapter + ): + adapter.register_uri( + "GET", + "/api/tasks", + json={ + # pre-0.19.26 shim returns ids instead of tasks + "tasks": None, + "ids": [], + }, + ) + + res = client.is_safe_to_restart() + + assert res is False + assert adapter.call_count == 2 + self.assert_request(adapter, 0, "GET", "/api/healthcheck") + self.assert_request(adapter, 1, "GET", "/api/tasks") + + @pytest.mark.parametrize( + "task_status", + [ + TaskStatus.PENDING, + TaskStatus.PREPARING, + TaskStatus.PULLING, + TaskStatus.CREATING, + TaskStatus.RUNNING, + ], + ) + def test_is_safe_to_restart_false_status_not_safe( + self, client: ShimClient, 
adapter: requests_mock.Adapter, task_status: TaskStatus + ): + adapter.register_uri( + "GET", + "/api/tasks", + json={ + "tasks": [ + { + "id": str(uuid.uuid4()), + "status": "terminated", + }, + { + "id": str(uuid.uuid4()), + "status": task_status.value, + }, + ], + "ids": None, + }, + ) + + res = client.is_safe_to_restart() + + assert res is False + assert adapter.call_count == 2 + self.assert_request(adapter, 0, "GET", "/api/healthcheck") + self.assert_request(adapter, 1, "GET", "/api/tasks") + + def test_is_safe_to_restart_true(self, client: ShimClient, adapter: requests_mock.Adapter): + adapter.register_uri( + "GET", + "/api/tasks", + json={ + "tasks": [ + { + "id": str(uuid.uuid4()), + "status": "terminated", + }, + { + "id": str(uuid.uuid4()), + # TODO: replace with "running" once it's safe + "status": "terminated", + }, + ], + "ids": None, + }, + ) + + res = client.is_safe_to_restart() + + assert res is True + assert adapter.call_count == 2 + self.assert_request(adapter, 0, "GET", "/api/healthcheck") + self.assert_request(adapter, 1, "GET", "/api/tasks") + def test_get_task(self, client: ShimClient, adapter: requests_mock.Adapter): task_id = "d35b6e24-b556-4d6e-81e3-5982d2c34449" url = f"/api/tasks/{task_id}" From 201952aa1fb5cdb4d6cd735864f18740b08350d3 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Thu, 18 Dec 2025 09:58:04 +0100 Subject: [PATCH 005/187] [Fleets] Updated error message and docs (#3377) --- docs/docs/guides/troubleshooting.md | 35 ++++++++++--------- .../tasks/process_submitted_jobs.py | 5 ++- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/docs/docs/guides/troubleshooting.md b/docs/docs/guides/troubleshooting.md index 44d6c98141..9ece2b4ffb 100644 --- a/docs/docs/guides/troubleshooting.md +++ b/docs/docs/guides/troubleshooting.md @@ -28,25 +28,28 @@ and [this](https://github.com/dstackai/dstack/issues/1551). 
## Typical issues -### No instance offers { #no-offers } +### No offers { #no-offers } [//]: # (NOTE: This section is referenced in the CLI. Do not change its URL.) If you run `dstack apply` and don't see any instance offers, it means that `dstack` could not find instances that match the requirements in your configuration. Below are some of the reasons why this might happen. -#### Cause 1: No capacity providers +> Feel free to use `dstack offer` to view available offers. -Before you can run any workloads, you need to configure a [backend](../concepts/backends.md), -create an [SSH fleet](../concepts/fleets.md#ssh-fleets), or sign up for -[dstack Sky](https://sky.dstack.ai). -If you have configured a backend and still can't use it, check the output of `dstack server` -for backend configuration errors. +#### Cause 1: No fleets -> **Tip**: You can find a list of successfully configured backends -> on the [project settings page](../concepts/projects.md#backends) in the UI. +Make sure you've created a [fleet](../concepts/fleets.md) before submitting any runs. -#### Cause 2: Requirements mismatch +#### Cause 2: No backends + +If you are not using [SSH fleets](../concepts/fleets.md#ssh-fleets), make sure you have configured at least one [backends](../concepts/backends.md). + +If you have configured a backend but still cannot use it, check the output of `dstack server` for backend configuration errors. + +> You can find a list of successfully configured backends on the [project settings page](../concepts/projects.md#backends) in the UI. + +#### Cause 3: Requirements mismatch When you apply a configuration, `dstack` tries to find instances that match the [`resources`](../reference/dstack.yml/task.md#resources), @@ -63,7 +66,7 @@ Make sure your configuration doesn't set any conflicting requirements, such as `regions` that don't exist in the specified `backends`, or `instance_types` that don't match the specified `resources`. 
-#### Cause 3: Too specific resources +#### Cause 4: Too specific resources If you set a resource requirement to an exact value, `dstack` will only select instances that have exactly that amount of resources. For example, `cpu: 5` and `memory: 10GB` will only @@ -73,14 +76,14 @@ Typically, you will want to set resource ranges to match more instances. For example, `cpu: 4..8` and `memory: 10GB..` will match instances with 4 to 8 CPUs and at least 10GB of memory. -#### Cause 4: Default resources +#### Cause 5: Default resources By default, `dstack` uses these resource requirements: `cpu: 2..`, `memory: 8GB..`, `disk: 100GB..`. If you want to use smaller instances, override the `cpu`, `memory`, or `disk` properties in your configuration. -#### Cause 5: GPU requirements +#### Cause 6: GPU requirements By default, `dstack` only selects instances with no GPUs or a single NVIDIA GPU. If you want to use non-NVIDIA GPUs or multi-GPU instances, set the `gpu` property @@ -91,13 +94,13 @@ Examples: `gpu: amd` (one AMD GPU), `gpu: A10:4..8` (4 to 8 A10 GPUs), > If you don't specify the number of GPUs, `dstack` will only select single-GPU instances. -#### Cause 6: Network volumes +#### Cause 7: Network volumes If your run configuration uses [network volumes](../concepts/volumes.md#network-volumes), `dstack` will only select instances from the same backend and region as the volumes. For AWS, the availability zone of the volume and the instance should also match. -#### Cause 7: Feature support +#### Cause 8: Feature support Some `dstack` features are not supported by all backends. If your configuration uses one of these features, `dstack` will only select offers from the backends that support it. @@ -113,7 +116,7 @@ one of these features, `dstack` will only select offers from the backends that s - [Reservations](../reference/dstack.yml/fleet.md#reservation) are only supported by the `aws` and `gcp` backends. 
-#### Cause 8: dstack Sky balance +#### Cause 9: dstack Sky balance If you are using [dstack Sky](https://sky.dstack.ai), diff --git a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py index defa75e8b5..21a5e4bffc 100644 --- a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py @@ -349,7 +349,10 @@ async def _process_submitted_job( job_model.termination_reason = ( JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY ) - job_model.termination_reason_message = "Failed to find fleet" + job_model.termination_reason_message = ( + "No fleet found. Create it before submitting a run: " + "https://dstack.ai/docs/concepts/fleets" + ) switch_job_status(session, job_model, JobStatus.TERMINATING) job_model.last_processed_at = common_utils.get_current_datetime() await session.commit() From b85c4f0350aaad2f35524d52eecd4bbab8b59ac6 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:39:27 +0100 Subject: [PATCH 006/187] [Blog] dstack 0.20 GA: Fleet-first UX and other important changes (#3401) --- docs/blog/posts/0_20.md | 127 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 docs/blog/posts/0_20.md diff --git a/docs/blog/posts/0_20.md b/docs/blog/posts/0_20.md new file mode 100644 index 0000000000..33f7e66e88 --- /dev/null +++ b/docs/blog/posts/0_20.md @@ -0,0 +1,127 @@ +--- +title: "dstack 0.20 GA: Fleet-first UX and other important changes" +date: 2025-12-18 +description: "TBA" +slug: "0_20" +image: https://dstack.ai/static-assets/static-assets/images/dstack-0_20.png +categories: + - Changelog +links: + - Release notes: https://github.com/dstackai/dstack/releases/tag/0.20.0 + - Migration guide: https://dstack.ai/docs/guides/migration/#0_20 +--- + +# dstack 0.20 GA: Fleet-first 
UX and other important changes + +We’re releasing `dstack` 0.20.0, a major update that improves how teams orchestrate GPU workloads for development, training, and inference. Most `dstack` updates are incremental and backward compatible, but this version introduces a few major changes to how you work with `dstack`. + +In `dstack` 0.20.0, fleets are now a first-class concept, giving you more explicit control over how GPU capacity is provisioned and managed. We’ve also added *Events*, which record important system activity—such as scheduling decisions, run status changes, and resource lifecycle updates—so it’s easier to understand what’s happening without digging through server logs. + + + +This post goes through the changes in detail and explains how to upgrade and migrate your existing setup. + + + +## Fleets + +In earlier versions, submitting a run that didn’t match any existing fleet would cause `dstack` to automatically create one. While this reduced setup overhead, it also made capacity provisioning implicit and less predictable. + +With `dstack` 0.20.0, fleets must be created explicitly and treated as first-class resources. This shift makes capacity provisioning declarative, improving control over resource limits, instance lifecycles, and overall fleet behavior. + +For users who previously relied on auto-created fleets, similar behavior can be achieved by defining an elastic fleet, for example: + +
+ + ```yaml + type: fleet + # The name is optional, if not specified, generated randomly + name: default + + # Can be a range or a fixed number + # Allow to provision of up to 2 instances + nodes: 0..2 + + # Uncomment to ensure instances are inter-connected + #placement: cluster + + # Deprovision instances above the minimum if they remain idle + idle_duration: 1h + + resources: + # Allow to provision up to 8 GPUs + gpu: 0..8 + ``` + +
+ +If the `nodes` range starts above `0`, `dstack` provisions the initial capacity upfront and scales additional instances on demand, enabling more predictable capacity planning. + +When a run does not explicitly reference a fleet (via the [`fleets`](../../docs/reference/dstack.yml/dev-environment.md#fleets) property), `dstack` automatically selects one that satisfies the run’s requirements. + +## Events + +Previously, when `dstack` changed the state of a run or other resource, that information was written only to the server logs. This worked for admins, but it made it hard for users to understand what happened or why. + +Starting with version `0.20.0`, `dstack` exposes these events directly to users. + +Each resource now includes an `Events` tab in the UI, showing related events in real time: + + + +There is also a dedicated `Events` page that aggregates events across resources. You can filter by project, user, run, or job to quickly narrow down what you’re looking for: + + + +The same information is available through the CLI: + + + +This makes it easier to track state changes, debug issues, and review past actions without needing access to server logs. + +## Runs + +This release updates several defaults related to run configuration. The goal is to reduce implicit assumptions and make it more convenient. + +### Working directory + +Previously, the `working_dir` property defaulted to `/workflow`. Now, the default working directory is always taken from the Docker image. + +The working directory in the default Docker images (if you don't specify image) is now set to `/dstack/run`. + +### Repo directory + +Previously, if you didn't specify a repo path, the repo was cloned to `/workflow`. Now, in that case the repo will be cloned to the working directory. + +
+
+```yaml
+type: dev-environment
+name: vscode
+
+repos:
+  # Clones the repo from the parent directory (`examples/..`) to `<working_dir>`
+  - ..
+
+ide: vscode
+```
+
+
+ +Also, now if the repo directory is not empty, the run will fail with an error. + +## Backward compatibility + +While the update introduces breaking changes, 0.19.* CLIs remain compatible with 0.20.* servers. + +> Note, the 0.20.* CLI only works with a 0.20.* server. + +!!! warning "Breaking changes" + This release introduces breaking changes that may affect existing setups. Before upgrading either the CLI or the server, review the [migration guide](https://dstack.ai/docs/guides/migration/#0_20). + +## What's next + +1. Follow the [Installation](../../docs/installation/index.md) guide +2. Try the [Quickstart](../../docs/quickstart.md) +3. Report issues on [GitHub](https://github.com/dstackai/dstack/issues) +4. Ask questions on [Discord](https://discord.gg/u8SmfwPpMd) From b2be6a7e4db1c52adcbb4688b9e450e694ad702d Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Thu, 18 Dec 2025 21:39:51 +0100 Subject: [PATCH 007/187] [Blog] dstack 0.20 GA: Fleet-first UX and other important changes (#3401) --- docs/blog/posts/0_20.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/blog/posts/0_20.md b/docs/blog/posts/0_20.md index 33f7e66e88..02c088e3e6 100644 --- a/docs/blog/posts/0_20.md +++ b/docs/blog/posts/0_20.md @@ -39,7 +39,7 @@ For users who previously relied on auto-created fleets, similar behavior can be name: default # Can be a range or a fixed number - # Allow to provision of up to 2 instances + # Allow to provision up to 2 instances nodes: 0..2 # Uncomment to ensure instances are inter-connected @@ -87,7 +87,7 @@ This release updates several defaults related to run configuration. The goal is Previously, the `working_dir` property defaulted to `/workflow`. Now, the default working directory is always taken from the Docker image. -The working directory in the default Docker images (if you don't specify image) is now set to `/dstack/run`. 
+The working directory in the default Docker images (if you don't specify `image`) is now set to `/dstack/run`. ### Repo directory From 616fa312152d7152fc6a76c65c493ed2367830e6 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Mon, 22 Dec 2025 07:41:07 +0000 Subject: [PATCH 008/187] [runner] Get container cgroup path from procfs (#3402) In addition, support for cgroups v1 has been dropped, it's almost 2026 Fixes: https://github.com/dstackai/dstack/issues/3398 --- runner/cmd/runner/main.go | 2 +- runner/internal/metrics/cgroups.go | 107 ++++++++++++++++++ runner/internal/metrics/cgroups_test.go | 87 ++++++++++++++ runner/internal/metrics/metrics.go | 82 +++++--------- runner/internal/metrics/metrics_test.go | 4 +- runner/internal/runner/api/http.go | 8 +- runner/internal/runner/api/server.go | 13 ++- .../background/tasks/process_metrics.py | 8 +- 8 files changed, 245 insertions(+), 66 deletions(-) create mode 100644 runner/internal/metrics/cgroups.go create mode 100644 runner/internal/metrics/cgroups_test.go diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index fc48233c62..27c07292b9 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -38,7 +38,7 @@ func start(tempDir string, homeDir string, httpPort int, sshPort int, logLevel i log.DefaultEntry.Logger.SetOutput(io.MultiWriter(os.Stdout, defaultLogFile)) log.DefaultEntry.Logger.SetLevel(logrus.Level(logLevel)) - server, err := api.NewServer(tempDir, homeDir, fmt.Sprintf(":%d", httpPort), sshPort, version) + server, err := api.NewServer(context.TODO(), tempDir, homeDir, fmt.Sprintf(":%d", httpPort), sshPort, version) if err != nil { return fmt.Errorf("create server: %w", err) } diff --git a/runner/internal/metrics/cgroups.go b/runner/internal/metrics/cgroups.go new file mode 100644 index 0000000000..9ce1e54fe6 --- /dev/null +++ b/runner/internal/metrics/cgroups.go @@ -0,0 +1,107 @@ +package metrics + +import ( + "bufio" + "context" + "errors" + "fmt" + "os" + "strings" + + 
"github.com/dstackai/dstack/runner/internal/log" +) + +func getProcessCgroupMountPoint(ctx context.Context, ProcPidMountsPath string) (string, error) { + // See proc_pid_mounts(5) for the ProcPidMountsPath file description + file, err := os.Open(ProcPidMountsPath) + if err != nil { + return "", fmt.Errorf("open mounts file: %w", err) + } + defer func() { + _ = file.Close() + }() + + mountPoint := "" + hasCgroupV1 := false + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + // See fstab(5) for the format description + fields := strings.Fields(line) + if len(fields) != 6 { + log.Warning(ctx, "Unexpected number of fields in mounts file", "num", len(fields), "line", line) + continue + } + fsType := fields[2] + if fsType == "cgroup2" { + mountPoint = fields[1] + break + } + if fsType == "cgroup" { + hasCgroupV1 = true + } + } + if err := scanner.Err(); err != nil { + log.Warning(ctx, "Error while scanning mounts file", "err", err) + } + + if mountPoint != "" { + return mountPoint, nil + } + + if hasCgroupV1 { + return "", errors.New("only cgroup v1 mounts found") + } + + return "", errors.New("no cgroup mounts found") +} + +func getProcessCgroupPathname(ctx context.Context, procPidCgroupPath string) (string, error) { + // See cgroups(7) for the procPidCgroupPath file description + file, err := os.Open(procPidCgroupPath) + if err != nil { + return "", fmt.Errorf("open cgroup file: %w", err) + } + defer func() { + _ = file.Close() + }() + + pathname := "" + hasCgroupV1 := false + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + // See cgroups(7) for the format description + fields := strings.Split(line, ":") + if len(fields) != 3 { + log.Warning(ctx, "Unexpected number of fields in cgroup file", "num", len(fields), "line", line) + continue + } + if fields[0] != "0" { + hasCgroupV1 = true + continue + } + if fields[1] != "" { + // Must be empty for v2 + log.Warning(ctx, "Unexpected v2 entry in cgroup 
file", "num", "line", line) + continue + } + pathname = fields[2] + break + } + if err := scanner.Err(); err != nil { + log.Warning(ctx, "Error while scanning cgroup file", "err", err) + } + + if pathname != "" { + return pathname, nil + } + + if hasCgroupV1 { + return "", errors.New("only cgroup v1 pathnames found") + } + + return "", errors.New("no cgroup pathname found") +} diff --git a/runner/internal/metrics/cgroups_test.go b/runner/internal/metrics/cgroups_test.go new file mode 100644 index 0000000000..3e6e0abca7 --- /dev/null +++ b/runner/internal/metrics/cgroups_test.go @@ -0,0 +1,87 @@ +package metrics + +import ( + "fmt" + "os" + "path" + "testing" + + "github.com/stretchr/testify/require" +) + +const ( + cgroup2MountLine = "cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot 0 0" + cgroupMountLine = "cgroup /sys/fs/cgroup/cpu,cpuacct cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct 0 0" + rootMountLine = "/dev/nvme0n1p5 / ext4 rw,relatime 0 0" +) + +func TestGetProcessCgroupMountPoint_ErrorNoCgroupMounts(t *testing.T) { + procPidMountsPath := createProcFile(t, "mounts", rootMountLine, "malformed line") + + mountPoint, err := getProcessCgroupMountPoint(t.Context(), procPidMountsPath) + + require.ErrorContains(t, err, "no cgroup mounts found") + require.Equal(t, "", mountPoint) +} + +func TestGetProcessCgroupMountPoint_ErrorOnlyCgroupV1Mounts(t *testing.T) { + procPidMountsPath := createProcFile(t, "mounts", rootMountLine, cgroupMountLine) + + mountPoint, err := getProcessCgroupMountPoint(t.Context(), procPidMountsPath) + + require.ErrorContains(t, err, "only cgroup v1 mounts found") + require.Equal(t, "", mountPoint) +} + +func TestGetProcessCgroupMountPoint_OK(t *testing.T) { + procPidMountsPath := createProcFile(t, "mounts", rootMountLine, cgroupMountLine, cgroup2MountLine) + + mountPoint, err := getProcessCgroupMountPoint(t.Context(), procPidMountsPath) + + require.NoError(t, err) + require.Equal(t, 
"/sys/fs/cgroup", mountPoint) +} + +func TestGetProcessCgroupPathname_ErrorNoCgroup(t *testing.T) { + procPidCgroupPath := createProcFile(t, "cgroup", "malformed entry") + + mountPoint, err := getProcessCgroupPathname(t.Context(), procPidCgroupPath) + + require.ErrorContains(t, err, "no cgroup pathname found") + require.Equal(t, "", mountPoint) +} + +func TestGetProcessCgroupPathname_ErrorOnlyCgroupV1(t *testing.T) { + procPidCgroupPath := createProcFile(t, "cgroup", "7:cpu,cpuacct:/user.slice") + + pathname, err := getProcessCgroupPathname(t.Context(), procPidCgroupPath) + + require.ErrorContains(t, err, "only cgroup v1 pathnames found") + require.Equal(t, "", pathname) +} + +func TestGetProcessCgroupPathname_OK(t *testing.T) { + procPidCgroupPath := createProcFile(t, "cgroup", "7:cpu,cpuacct:/user.slice", "0::/user.slice/user-1000.slice/session-1.scope") + + mountPoint, err := getProcessCgroupPathname(t.Context(), procPidCgroupPath) + + require.NoError(t, err) + require.Equal(t, "/user.slice/user-1000.slice/session-1.scope", mountPoint) +} + +func createProcFile(t *testing.T, name string, lines ...string) string { + t.Helper() + tmpDir := t.TempDir() + pth := path.Join(tmpDir, name) + file, err := os.OpenFile(pth, os.O_WRONLY|os.O_CREATE, 0o600) + require.NoError(t, err) + defer func() { + err := file.Close() + require.NoError(t, err) + }() + for _, line := range lines { + _, err := fmt.Fprintln(file, line) + require.NoError(t, err) + } + return pth +} diff --git a/runner/internal/metrics/metrics.go b/runner/internal/metrics/metrics.go index 0a5c1a639e..26acc2cdf4 100644 --- a/runner/internal/metrics/metrics.go +++ b/runner/internal/metrics/metrics.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "os/exec" + "path" "strconv" "strings" "time" @@ -17,33 +18,42 @@ import ( ) type MetricsCollector struct { - cgroupVersion int - gpuVendor common.GpuVendor + cgroupMountPoint string + gpuVendor common.GpuVendor } -func NewMetricsCollector() (*MetricsCollector, error) { - 
cgroupVersion, err := getCgroupVersion() +func NewMetricsCollector(ctx context.Context) (*MetricsCollector, error) { + // It's unlikely that cgroup mount point will change during container lifetime, + // so we detect it only once and reuse. + cgroupMountPoint, err := getProcessCgroupMountPoint(ctx, "/proc/self/mounts") if err != nil { - return nil, err + return nil, fmt.Errorf("get cgroup mount point: %w", err) } gpuVendor := common.GetGpuVendor() return &MetricsCollector{ - cgroupVersion: cgroupVersion, - gpuVendor: gpuVendor, + cgroupMountPoint: cgroupMountPoint, + gpuVendor: gpuVendor, }, nil } func (s *MetricsCollector) GetSystemMetrics(ctx context.Context) (*schemas.SystemMetrics, error) { + // It's possible to move a process from one control group to another (it's unlikely, but nonetheless), + // so we detect the current group each time. + cgroupPathname, err := getProcessCgroupPathname(ctx, "/proc/self/cgroup") + if err != nil { + return nil, fmt.Errorf("get cgroup pathname: %w", err) + } + cgroupPath := path.Join(s.cgroupMountPoint, cgroupPathname) timestamp := time.Now() - cpuUsage, err := s.GetCPUUsageMicroseconds() + cpuUsage, err := s.GetCPUUsageMicroseconds(cgroupPath) if err != nil { return nil, err } - memoryUsage, err := s.GetMemoryUsageBytes() + memoryUsage, err := s.GetMemoryUsageBytes(cgroupPath) if err != nil { return nil, err } - memoryCache, err := s.GetMemoryCacheBytes() + memoryCache, err := s.GetMemoryCacheBytes(cgroupPath) if err != nil { return nil, err } @@ -61,28 +71,14 @@ func (s *MetricsCollector) GetSystemMetrics(ctx context.Context) (*schemas.Syste }, nil } -func (s *MetricsCollector) GetCPUUsageMicroseconds() (uint64, error) { - cgroupCPUUsagePath := "/sys/fs/cgroup/cpu.stat" - if s.cgroupVersion == 1 { - cgroupCPUUsagePath = "/sys/fs/cgroup/cpuacct/cpuacct.usage" - } +func (s *MetricsCollector) GetCPUUsageMicroseconds(cgroupPath string) (uint64, error) { + cgroupCPUUsagePath := path.Join(cgroupPath, "cpu.stat") data, err := 
os.ReadFile(cgroupCPUUsagePath) if err != nil { return 0, fmt.Errorf("could not read CPU usage: %w", err) } - if s.cgroupVersion == 1 { - // cgroup v1 provides usage in nanoseconds - usageStr := strings.TrimSpace(string(data)) - cpuUsage, err := strconv.ParseUint(usageStr, 10, 64) - if err != nil { - return 0, fmt.Errorf("could not parse CPU usage: %w", err) - } - // convert nanoseconds to microseconds - return cpuUsage / 1000, nil - } - // cgroup v2, we need to extract usage_usec from cpu.stat lines := strings.Split(string(data), "\n") for _, line := range lines { if strings.HasPrefix(line, "usage_usec") { @@ -100,11 +96,8 @@ func (s *MetricsCollector) GetCPUUsageMicroseconds() (uint64, error) { return 0, fmt.Errorf("usage_usec not found in cpu.stat") } -func (s *MetricsCollector) GetMemoryUsageBytes() (uint64, error) { - cgroupMemoryUsagePath := "/sys/fs/cgroup/memory.current" - if s.cgroupVersion == 1 { - cgroupMemoryUsagePath = "/sys/fs/cgroup/memory/memory.usage_in_bytes" - } +func (s *MetricsCollector) GetMemoryUsageBytes(cgroupPath string) (uint64, error) { + cgroupMemoryUsagePath := path.Join(cgroupPath, "memory.current") data, err := os.ReadFile(cgroupMemoryUsagePath) if err != nil { @@ -119,11 +112,8 @@ func (s *MetricsCollector) GetMemoryUsageBytes() (uint64, error) { return usedMemory, nil } -func (s *MetricsCollector) GetMemoryCacheBytes() (uint64, error) { - cgroupMemoryStatPath := "/sys/fs/cgroup/memory.stat" - if s.cgroupVersion == 1 { - cgroupMemoryStatPath = "/sys/fs/cgroup/memory/memory.stat" - } +func (s *MetricsCollector) GetMemoryCacheBytes(cgroupPath string) (uint64, error) { + cgroupMemoryStatPath := path.Join(cgroupPath, "memory.stat") statData, err := os.ReadFile(cgroupMemoryStatPath) if err != nil { @@ -132,8 +122,7 @@ func (s *MetricsCollector) GetMemoryCacheBytes() (uint64, error) { lines := strings.Split(string(statData), "\n") for _, line := range lines { - if (s.cgroupVersion == 1 && strings.HasPrefix(line, "total_inactive_file")) || 
- (s.cgroupVersion == 2 && strings.HasPrefix(line, "inactive_file")) { + if strings.HasPrefix(line, "inactive_file") { parts := strings.Fields(line) if len(parts) != 2 { return 0, fmt.Errorf("unexpected format in memory.stat") @@ -255,23 +244,6 @@ func (s *MetricsCollector) GetIntelAcceleratorMetrics(ctx context.Context) ([]sc return parseNVIDIASMILikeMetrics(out.String()) } -func getCgroupVersion() (int, error) { - data, err := os.ReadFile("/proc/self/mountinfo") - if err != nil { - return 0, fmt.Errorf("could not read /proc/self/mountinfo: %w", err) - } - - for _, line := range strings.Split(string(data), "\n") { - if strings.Contains(line, "cgroup2") { - return 2, nil - } else if strings.Contains(line, "cgroup") { - return 1, nil - } - } - - return 0, fmt.Errorf("could not determine cgroup version") -} - func parseNVIDIASMILikeMetrics(output string) ([]schemas.GPUMetrics, error) { metrics := []schemas.GPUMetrics{} diff --git a/runner/internal/metrics/metrics_test.go b/runner/internal/metrics/metrics_test.go index d547e2e330..152f31c1b7 100644 --- a/runner/internal/metrics/metrics_test.go +++ b/runner/internal/metrics/metrics_test.go @@ -12,7 +12,7 @@ func TestGetAMDGPUMetrics_OK(t *testing.T) { if runtime.GOOS == "darwin" { t.Skip("Skipping on macOS") } - collector, err := NewMetricsCollector() + collector, err := NewMetricsCollector(t.Context()) assert.NoError(t, err) cases := []struct { @@ -46,7 +46,7 @@ func TestGetAMDGPUMetrics_ErrorGPUUtilNA(t *testing.T) { if runtime.GOOS == "darwin" { t.Skip("Skipping on macOS") } - collector, err := NewMetricsCollector() + collector, err := NewMetricsCollector(t.Context()) assert.NoError(t, err) metrics, err := collector.getAMDGPUMetrics("gpu,gfx,gfx_clock,vram_used,vram_total\n0,N/A,N/A,283,196300\n") assert.ErrorContains(t, err, "GPU utilization is N/A") diff --git a/runner/internal/runner/api/http.go b/runner/internal/runner/api/http.go index ac13b5e5b4..bbf416efbe 100644 --- a/runner/internal/runner/api/http.go +++ 
b/runner/internal/runner/api/http.go @@ -16,7 +16,6 @@ import ( "github.com/dstackai/dstack/runner/internal/api" "github.com/dstackai/dstack/runner/internal/executor" "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/metrics" "github.com/dstackai/dstack/runner/internal/schemas" ) @@ -28,11 +27,10 @@ func (s *Server) healthcheckGetHandler(w http.ResponseWriter, r *http.Request) ( } func (s *Server) metricsGetHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { - metricsCollector, err := metrics.NewMetricsCollector() - if err != nil { - return nil, &api.Error{Status: http.StatusInternalServerError, Err: err} + if s.metricsCollector == nil { + return nil, &api.Error{Status: http.StatusNotFound, Msg: "Metrics collector is not available"} } - metrics, err := metricsCollector.GetSystemMetrics(r.Context()) + metrics, err := s.metricsCollector.GetSystemMetrics(r.Context()) if err != nil { return nil, &api.Error{Status: http.StatusInternalServerError, Err: err} } diff --git a/runner/internal/runner/api/server.go b/runner/internal/runner/api/server.go index be573cc663..9d98315b1b 100644 --- a/runner/internal/runner/api/server.go +++ b/runner/internal/runner/api/server.go @@ -12,6 +12,7 @@ import ( "github.com/dstackai/dstack/runner/internal/api" "github.com/dstackai/dstack/runner/internal/executor" "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/metrics" ) type Server struct { @@ -29,15 +30,23 @@ type Server struct { executor executor.Executor cancelRun context.CancelFunc + metricsCollector *metrics.MetricsCollector + version string } -func NewServer(tempDir string, homeDir string, address string, sshPort int, version string) (*Server, error) { +func NewServer(ctx context.Context, tempDir string, homeDir string, address string, sshPort int, version string) (*Server, error) { r := api.NewRouter() ex, err := executor.NewRunExecutor(tempDir, homeDir, sshPort) if err != 
nil { return nil, err } + + metricsCollector, err := metrics.NewMetricsCollector(ctx) + if err != nil { + log.Warning(ctx, "Metrics collector is not available", "err", err) + } + s := &Server{ srv: &http.Server{ Addr: address, @@ -55,6 +64,8 @@ func NewServer(tempDir string, homeDir string, address string, sshPort int, vers executor: ex, + metricsCollector: metricsCollector, + version: version, } r.AddHandler("GET", "/api/healthcheck", s.healthcheckGetHandler) diff --git a/src/dstack/_internal/server/background/tasks/process_metrics.py b/src/dstack/_internal/server/background/tasks/process_metrics.py index d2197d4229..ca2d25fe5f 100644 --- a/src/dstack/_internal/server/background/tasks/process_metrics.py +++ b/src/dstack/_internal/server/background/tasks/process_metrics.py @@ -140,8 +140,12 @@ async def _collect_job_metrics(job_model: JobModel) -> Optional[JobMetricsPoint] return None if res is None: - logger.warning( - "Failed to collect job %s metrics. Runner version does not support metrics API.", + logger.debug( + ( + "Failed to collect job %s metrics." + " Either runner version does not support metrics API" + " or metrics collector is not available." 
+ ), job_model.job_name, ) return None From 635c38dae19b639f34f1b95e47c8678668beb556 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Mon, 22 Dec 2025 09:10:26 +0100 Subject: [PATCH 009/187] [Internal] Add an index for user email (#3409) --- .../1aa9638ad963_added_email_index.py | 31 +++++++++++++++++++ src/dstack/_internal/server/models.py | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 src/dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py diff --git a/src/dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py b/src/dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py new file mode 100644 index 0000000000..3b5a9d8b5c --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py @@ -0,0 +1,31 @@ +"""Added email index + +Revision ID: 1aa9638ad963 +Revises: 22d74df9897e +Create Date: 2025-12-21 22:08:27.331645 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "1aa9638ad963" +down_revision = "22d74df9897e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_users_email"), ["email"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("users", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_users_email")) + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 22a70eceb3..33cd689e44 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -201,7 +201,7 @@ class UserModel(BaseModel): ssh_private_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True) ssh_public_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True) - email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True) + email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True, index=True) projects_quota: Mapped[int] = mapped_column( Integer, default=settings.USER_PROJECT_DEFAULT_QUOTA From 139a9adf265f9bb5aaa277cc8bcfcc24ece338a7 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Mon, 22 Dec 2025 08:54:59 +0000 Subject: [PATCH 010/187] Don't send asyncio.CancelledError to Sentry (#3404) --- src/dstack/_internal/server/app.py | 2 ++ src/dstack/_internal/server/utils/sentry_utils.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index 736733b403..9c83bac793 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -58,6 +58,7 @@ SERVER_URL, UPDATE_DEFAULT_PROJECT, ) +from dstack._internal.server.utils import sentry_utils from dstack._internal.server.utils.logging import configure_logging from dstack._internal.server.utils.routers import ( CustomORJSONResponse, @@ -105,6 +106,7 @@ async def lifespan(app: FastAPI): enable_tracing=True, traces_sampler=_sentry_traces_sampler, profiles_sample_rate=settings.SENTRY_PROFILES_SAMPLE_RATE, + before_send=sentry_utils.AsyncioCancelledErrorFilterEventProcessor(), ) server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS) 
asyncio.get_running_loop().set_default_executor(server_executor) diff --git a/src/dstack/_internal/server/utils/sentry_utils.py b/src/dstack/_internal/server/utils/sentry_utils.py index c878e1e912..8dd7326b73 100644 --- a/src/dstack/_internal/server/utils/sentry_utils.py +++ b/src/dstack/_internal/server/utils/sentry_utils.py @@ -1,6 +1,9 @@ +import asyncio import functools +from typing import Optional import sentry_sdk +from sentry_sdk.types import Event, Hint def instrument_background_task(f): @@ -10,3 +13,12 @@ async def wrapper(*args, **kwargs): return await f(*args, **kwargs) return wrapper + + +class AsyncioCancelledErrorFilterEventProcessor: + # See https://docs.sentry.io/platforms/python/configuration/filtering/#filtering-error-events + def __call__(self, event: Event, hint: Hint) -> Optional[Event]: + exc_info = hint.get("exc_info") + if exc_info and isinstance(exc_info[1], asyncio.CancelledError): + return None + return event From 018b40e51d2808cfa3028f1434ad5b710bcf602a Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Mon, 22 Dec 2025 10:24:27 +0100 Subject: [PATCH 011/187] [Internal] Allow passing `AnyActor` to `update_user` (#3410) --- src/dstack/_internal/server/routers/users.py | 4 ++-- src/dstack/_internal/server/services/users.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dstack/_internal/server/routers/users.py b/src/dstack/_internal/server/routers/users.py index 1feac5da36..6030416f50 100644 --- a/src/dstack/_internal/server/routers/users.py +++ b/src/dstack/_internal/server/routers/users.py @@ -15,7 +15,7 @@ UpdateUserRequest, ) from dstack._internal.server.security.permissions import Authenticated, GlobalAdmin -from dstack._internal.server.services import users +from dstack._internal.server.services import events, users from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, @@ -86,7 +86,7 @@ async def 
update_user( ): res = await users.update_user( session=session, - actor=user, + actor=events.UserActor.from_user(user), username=body.username, global_role=body.global_role, email=body.email, diff --git a/src/dstack/_internal/server/services/users.py b/src/dstack/_internal/server/services/users.py index e8fbcde782..3f8f6afa7b 100644 --- a/src/dstack/_internal/server/services/users.py +++ b/src/dstack/_internal/server/services/users.py @@ -130,7 +130,7 @@ async def create_user( async def update_user( session: AsyncSession, - actor: UserModel, + actor: events.AnyActor, username: str, global_role: GlobalRole, email: Optional[str] = None, @@ -152,7 +152,7 @@ async def update_user( events.emit( session, f"User updated. Updated fields: {', '.join(updated_fields) or ''}", - actor=events.UserActor.from_user(actor), + actor=actor, targets=[events.Target.from_model(user)], ) await session.commit() From 8ee924ee05c4332b018a8c38207b9dec1d6241ed Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Mon, 22 Dec 2025 12:51:53 +0100 Subject: [PATCH 012/187] Replace `Instance.termination_reason` values with codes (#3187) Co-authored-by: Jvst Me --- src/dstack/_internal/core/models/instances.py | 70 +++++++++++++++++++ .../server/background/tasks/process_fleets.py | 5 +- .../background/tasks/process_instances.py | 50 +++++++++---- ...dd_instances_termination_reason_message.py | 34 +++++++++ src/dstack/_internal/server/models.py | 43 ++++++++++-- .../_internal/server/services/instances.py | 5 +- .../tasks/test_process_instances.py | 29 ++++---- .../_internal/server/routers/test_fleets.py | 4 ++ .../server/routers/test_instances.py | 23 ++++++ 9 files changed, 225 insertions(+), 38 deletions(-) create mode 100644 src/dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py diff --git a/src/dstack/_internal/core/models/instances.py b/src/dstack/_internal/core/models/instances.py index 
bfe01c98bc..2bc0c1f898 100644 --- a/src/dstack/_internal/core/models/instances.py +++ b/src/dstack/_internal/core/models/instances.py @@ -15,6 +15,9 @@ from dstack._internal.core.models.health import HealthStatus from dstack._internal.core.models.volumes import Volume from dstack._internal.utils.common import pretty_resources +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) class Gpu(CoreModel): @@ -254,6 +257,70 @@ def finished_statuses(cls) -> List["InstanceStatus"]: return [cls.TERMINATING, cls.TERMINATED] +class InstanceTerminationReason(str, Enum): + IDLE_TIMEOUT = "idle_timeout" + PROVISIONING_TIMEOUT = "provisioning_timeout" + ERROR = "error" + JOB_FINISHED = "job_finished" + UNREACHABLE = "unreachable" + NO_OFFERS = "no_offers" + MASTER_FAILED = "master_failed" + MAX_INSTANCES_LIMIT = "max_instances_limit" + NO_BALANCE = "no_balance" # used in dstack Sky + + @classmethod + def from_legacy_str(cls, v: str) -> "InstanceTerminationReason": + """ + Convert legacy termination reason string to relevant termination reason enum. + + dstack versions prior to 0.20.1 represented instance termination reasons as raw + strings. Such strings may still be stored in the database. 
+ """ + + if v == "Idle timeout": + return cls.IDLE_TIMEOUT + if v in ( + "Instance has not become running in time", + "Provisioning timeout expired", + "Proivisioning timeout expired", # typo is intentional + "The proivisioning timeout expired", # typo is intentional + ): + return cls.PROVISIONING_TIMEOUT + if v in ( + "Unsupported private SSH key type", + "Failed to locate internal IP address on the given network", + "Specified internal IP not found among instance interfaces", + "Cannot split into blocks", + "Backend not available", + "Error while waiting for instance to become running", + "Empty profile, requirements or instance_configuration", + "Unable to locate the internal ip-address for the given network", + "Private SSH key is encrypted, password required", + "Cannot parse private key, key type is not supported", + ) or v.startswith("Error to parse profile, requirements or instance_configuration:"): + return cls.ERROR + if v in ( + "All offers failed", + "No offers found", + "There were no offers found", + "Retry duration expired", + "The retry's duration expired", + ): + return cls.NO_OFFERS + if v == "Master instance failed to start": + return cls.MASTER_FAILED + if v == "Instance job finished": + return cls.JOB_FINISHED + if v == "Termination deadline": + return cls.UNREACHABLE + if v == "Fleet has too many instances": + return cls.MAX_INSTANCES_LIMIT + if v == "Low account balance": + return cls.NO_BALANCE + logger.warning("Unexpected instance termination reason string: %r", v) + return cls.ERROR + + class Instance(CoreModel): id: UUID project_name: str @@ -268,7 +335,10 @@ class Instance(CoreModel): status: InstanceStatus unreachable: bool = False health_status: HealthStatus = HealthStatus.HEALTHY + # termination_reason stores InstanceTerminationReason. + # str allows adding new enum members without breaking compatibility with old clients. 
termination_reason: Optional[str] = None + termination_reason_message: Optional[str] = None created: datetime.datetime region: Optional[str] = None availability_zone: Optional[str] = None diff --git a/src/dstack/_internal/server/background/tasks/process_fleets.py b/src/dstack/_internal/server/background/tasks/process_fleets.py index ffa83e10d7..733029abf8 100644 --- a/src/dstack/_internal/server/background/tasks/process_fleets.py +++ b/src/dstack/_internal/server/background/tasks/process_fleets.py @@ -8,7 +8,7 @@ from sqlalchemy.orm import joinedload, load_only, selectinload from dstack._internal.core.models.fleets import FleetSpec, FleetStatus -from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ( FleetModel, @@ -213,7 +213,8 @@ def _maintain_fleet_nodes_in_min_max_range( break if instance.status in [InstanceStatus.IDLE]: instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Fleet has too many instances" + instance.termination_reason = InstanceTerminationReason.MAX_INSTANCES_LIMIT + instance.termination_reason_message = "Fleet has too many instances" nodes_redundant -= 1 logger.info( "Terminating instance %s: %s", diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 7d54171765..4b45e68b13 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -47,6 +47,7 @@ InstanceOfferWithAvailability, InstanceRuntime, InstanceStatus, + InstanceTerminationReason, RemoteConnectionInfo, SSHKey, ) @@ -274,7 +275,7 @@ def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel delta = datetime.timedelta(seconds=idle_seconds) if 
idle_duration > delta: instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Idle timeout" + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT logger.info( "Instance %s idle duration expired: idle time %ss. Terminating", instance.name, @@ -310,7 +311,7 @@ async def _add_remote(instance: InstanceModel) -> None: retry_duration_deadline = instance.created_at + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS) if retry_duration_deadline < get_current_datetime(): instance.status = InstanceStatus.TERMINATED - instance.termination_reason = "Provisioning timeout expired" + instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT logger.warning( "Failed to start instance %s in %d seconds. Terminating...", instance.name, @@ -333,7 +334,8 @@ async def _add_remote(instance: InstanceModel) -> None: ssh_proxy_pkeys = None except (ValueError, PasswordRequiredException): instance.status = InstanceStatus.TERMINATED - instance.termination_reason = "Unsupported private SSH key type" + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Unsupported private SSH key type" logger.warning( "Failed to add instance %s: unsupported private SSH key type", instance.name, @@ -391,7 +393,10 @@ async def _add_remote(instance: InstanceModel) -> None: ) if instance_network is not None and internal_ip is None: instance.status = InstanceStatus.TERMINATED - instance.termination_reason = "Failed to locate internal IP address on the given network" + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = ( + "Failed to locate internal IP address on the given network" + ) logger.warning( "Failed to add instance %s: failed to locate internal IP address on the given network", instance.name, @@ -404,7 +409,8 @@ async def _add_remote(instance: InstanceModel) -> None: if internal_ip is not None: if not is_ip_among_addresses(ip_address=internal_ip, 
addresses=host_network_addresses): instance.status = InstanceStatus.TERMINATED - instance.termination_reason = ( + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = ( "Specified internal IP not found among instance interfaces" ) logger.warning( @@ -426,7 +432,8 @@ async def _add_remote(instance: InstanceModel) -> None: instance.total_blocks = blocks else: instance.status = InstanceStatus.TERMINATED - instance.termination_reason = "Cannot split into blocks" + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Cannot split into blocks" logger.warning( "Failed to add instance %s: cannot split into blocks", instance.name, @@ -545,7 +552,8 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No requirements = get_instance_requirements(instance) except ValidationError as e: instance.status = InstanceStatus.TERMINATED - instance.termination_reason = ( + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = ( f"Error to parse profile, requirements or instance_configuration: {e}" ) logger.warning( @@ -671,19 +679,28 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No ) return - _mark_terminated(instance, "All offers failed" if offers else "No offers found") + _mark_terminated( + instance, + InstanceTerminationReason.NO_OFFERS, + "All offers failed" if offers else "No offers found", + ) if instance.fleet and is_fleet_master_instance(instance) and is_cloud_cluster(instance.fleet): # Do not attempt to deploy other instances, as they won't determine the correct cluster # backend, region, and placement group without a successfully deployed master instance for sibling_instance in instance.fleet.instances: if sibling_instance.id == instance.id: continue - _mark_terminated(sibling_instance, "Master instance failed to start") + _mark_terminated(sibling_instance, 
InstanceTerminationReason.MASTER_FAILED) -def _mark_terminated(instance: InstanceModel, termination_reason: str) -> None: +def _mark_terminated( + instance: InstanceModel, + termination_reason: InstanceTerminationReason, + termination_reason_message: Optional[str] = None, +) -> None: instance.status = InstanceStatus.TERMINATED instance.termination_reason = termination_reason + instance.termination_reason_message = termination_reason_message logger.info( "Terminated instance %s: %s", instance.name, @@ -703,7 +720,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non ): # A busy instance could have no active jobs due to this bug: https://github.com/dstackai/dstack/issues/2068 instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Instance job finished" + instance.termination_reason = InstanceTerminationReason.JOB_FINISHED logger.info( "Detected busy instance %s with finished job. Marked as TERMINATING", instance.name, @@ -832,7 +849,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non deadline = instance.termination_deadline if get_current_datetime() > deadline: instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Termination deadline" + instance.termination_reason = InstanceTerminationReason.UNREACHABLE logger.warning( "Instance %s shim waiting timeout. 
Marked as TERMINATING", instance.name, @@ -861,7 +878,8 @@ async def _wait_for_instance_provisioning_data( "Instance %s failed because instance has not become running in time", instance.name ) instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Instance has not become running in time" + instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT + instance.termination_reason_message = "Backend did not complete provisioning in time" return backend = await backends_services.get_project_backend_by_type( @@ -874,7 +892,8 @@ async def _wait_for_instance_provisioning_data( instance.name, ) instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Backend not available" + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Backend not available" return try: await run_async( @@ -891,7 +910,8 @@ async def _wait_for_instance_provisioning_data( repr(e), ) instance.status = InstanceStatus.TERMINATING - instance.termination_reason = "Error while waiting for instance to become running" + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Error while waiting for instance to become running" except Exception: logger.exception( "Got exception when updating instance %s provisioning data", instance.name diff --git a/src/dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py b/src/dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py new file mode 100644 index 0000000000..ff025fa2ba --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py @@ -0,0 +1,34 @@ +"""Add instances.termination_reason_message + +Revision ID: 903c91e24634 +Revises: 1aa9638ad963 +Create Date: 2025-12-22 12:17:58.573457 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision 
identifiers, used by Alembic. +revision = "903c91e24634" +down_revision = "1aa9638ad963" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column("termination_reason_message", sa.String(length=4000), nullable=True) + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("termination_reason_message") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 33cd689e44..5274d9ebfd 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -1,7 +1,7 @@ import enum import uuid from datetime import datetime, timezone -from typing import Callable, List, Optional, Union +from typing import Callable, Generic, List, Optional, TypeVar, Union from sqlalchemy import ( BigInteger, @@ -30,7 +30,7 @@ from dstack._internal.core.models.fleets import FleetStatus from dstack._internal.core.models.gateways import GatewayStatus from dstack._internal.core.models.health import HealthStatus -from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason from dstack._internal.core.models.profiles import ( DEFAULT_FLEET_TERMINATION_IDLE_TIME, TerminationPolicy, @@ -141,7 +141,10 @@ def process_result_value(self, value: Optional[str], dialect) -> Optional[Decryp return DecryptedString(plaintext=None, decrypted=False, exc=e) -class EnumAsString(TypeDecorator): +E = TypeVar("E", bound=enum.Enum) + + +class EnumAsString(TypeDecorator, Generic[E]): """ A custom type decorator that stores enums as strings in the DB. 
""" @@ -149,18 +152,34 @@ class EnumAsString(TypeDecorator): impl = String cache_ok = True - def __init__(self, enum_class: type[enum.Enum], *args, **kwargs): + def __init__( + self, + enum_class: type[E], + *args, + fallback_deserializer: Optional[Callable[[str], E]] = None, + **kwargs, + ): + """ + Args: + enum_class: The enum class to be stored. + fallback_deserializer: An optional function used when the string + from the DB does not match any enum member name. If not + provided, an exception will be raised in such cases. + """ self.enum_class = enum_class + self.fallback_deserializer = fallback_deserializer super().__init__(*args, **kwargs) - def process_bind_param(self, value: Optional[enum.Enum], dialect) -> Optional[str]: + def process_bind_param(self, value: Optional[E], dialect) -> Optional[str]: if value is None: return None return value.name - def process_result_value(self, value: Optional[str], dialect) -> Optional[enum.Enum]: + def process_result_value(self, value: Optional[str], dialect) -> Optional[E]: if value is None: return None + if value not in self.enum_class.__members__ and self.fallback_deserializer is not None: + return self.fallback_deserializer(value) return self.enum_class[value] @@ -641,7 +660,17 @@ class InstanceModel(BaseModel): # instance termination handling termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - termination_reason: Mapped[Optional[str]] = mapped_column(String(4000)) + # dstack versions prior to 0.20.1 represented instance termination reasons as raw strings. + # Such strings may still be stored in the database, so we are using a wide column (4000 chars) + # and a fallback deserializer to convert them to relevant enum members. 
+ termination_reason: Mapped[Optional[InstanceTerminationReason]] = mapped_column( + EnumAsString( + InstanceTerminationReason, + 4000, + fallback_deserializer=InstanceTerminationReason.from_legacy_str, + ) + ) + termination_reason_message: Mapped[Optional[str]] = mapped_column(String(4000)) # Deprecated since 0.19.22, not used health_status: Mapped[Optional[str]] = mapped_column(String(4000), deferred=True) health: Mapped[HealthStatus] = mapped_column( diff --git a/src/dstack/_internal/server/services/instances.py b/src/dstack/_internal/server/services/instances.py index 56459efd78..bf837469d0 100644 --- a/src/dstack/_internal/server/services/instances.py +++ b/src/dstack/_internal/server/services/instances.py @@ -128,7 +128,10 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance: status=instance_model.status, unreachable=instance_model.unreachable, health_status=instance_model.health, - termination_reason=instance_model.termination_reason, + termination_reason=( + instance_model.termination_reason.value if instance_model.termination_reason else None + ), + termination_reason_message=instance_model.termination_reason_message, created=instance_model.created_at, total_blocks=instance_model.total_blocks, busy_blocks=instance_model.busy_blocks, diff --git a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/tasks/test_process_instances.py index cb5028c42b..bed206e92a 100644 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ b/src/tests/_internal/server/background/tasks/test_process_instances.py @@ -29,6 +29,7 @@ InstanceOffer, InstanceOfferWithAvailability, InstanceStatus, + InstanceTerminationReason, InstanceType, Resources, ) @@ -262,7 +263,7 @@ async def test_check_shim_terminate_instance_by_deadline(self, test_db, session: assert instance is not None assert instance.status == InstanceStatus.TERMINATING assert instance.termination_deadline == 
termination_deadline_time - assert instance.termination_reason == "Termination deadline" + assert instance.termination_reason == InstanceTerminationReason.UNREACHABLE @pytest.mark.asyncio @pytest.mark.parametrize( @@ -529,7 +530,7 @@ async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession): await session.refresh(instance) assert instance is not None assert instance.status == InstanceStatus.TERMINATING - assert instance.termination_reason == "Idle timeout" + assert instance.termination_reason == InstanceTerminationReason.IDLE_TIMEOUT class TestSSHInstanceTerminateProvisionTimeoutExpired: @@ -550,7 +551,7 @@ async def test_terminate_by_idle_timeout(self, test_db, session: AsyncSession): await session.refresh(instance) assert instance.status == InstanceStatus.TERMINATED - assert instance.termination_reason == "Provisioning timeout expired" + assert instance.termination_reason == InstanceTerminationReason.PROVISIONING_TIMEOUT class TestTerminate: @@ -575,8 +576,7 @@ async def test_terminate(self, test_db, session: AsyncSession): instance = await create_instance( session=session, project=project, status=InstanceStatus.TERMINATING ) - reason = "some reason" - instance.termination_reason = reason + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) await session.commit() @@ -588,7 +588,7 @@ async def test_terminate(self, test_db, session: AsyncSession): assert instance is not None assert instance.status == InstanceStatus.TERMINATED - assert instance.termination_reason == "some reason" + assert instance.termination_reason == InstanceTerminationReason.IDLE_TIMEOUT assert instance.deleted == True assert instance.deleted_at is not None assert instance.finished_at is not None @@ -603,7 +603,7 @@ async def test_terminate_retry(self, test_db, session: AsyncSession, error: Exce instance = await create_instance( session=session, project=project, 
status=InstanceStatus.TERMINATING ) - instance.termination_reason = "some reason" + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) instance.last_job_processed_at = initial_time await session.commit() @@ -635,7 +635,7 @@ async def test_terminate_not_retries_if_too_early(self, test_db, session: AsyncS instance = await create_instance( session=session, project=project, status=InstanceStatus.TERMINATING ) - instance.termination_reason = "some reason" + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) instance.last_job_processed_at = initial_time await session.commit() @@ -667,7 +667,7 @@ async def test_terminate_on_termination_deadline(self, test_db, session: AsyncSe instance = await create_instance( session=session, project=project, status=InstanceStatus.TERMINATING ) - instance.termination_reason = "some reason" + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) instance.last_job_processed_at = initial_time await session.commit() @@ -819,7 +819,7 @@ async def test_fails_if_all_offers_fail(self, session: AsyncSession, err: Except await session.refresh(instance) assert instance.status == InstanceStatus.TERMINATED - assert instance.termination_reason == "All offers failed" + assert instance.termination_reason == InstanceTerminationReason.NO_OFFERS async def test_fails_if_no_offers(self, session: AsyncSession): project = await create_project(session=session) @@ -832,19 +832,22 @@ async def test_fails_if_no_offers(self, session: AsyncSession): await session.refresh(instance) assert instance.status == InstanceStatus.TERMINATED - assert instance.termination_reason == "No offers found" + assert instance.termination_reason == InstanceTerminationReason.NO_OFFERS @pytest.mark.parametrize( ("placement", 
"expected_termination_reasons"), [ pytest.param( InstanceGroupPlacement.CLUSTER, - {"No offers found": 1, "Master instance failed to start": 3}, + { + InstanceTerminationReason.NO_OFFERS: 1, + InstanceTerminationReason.MASTER_FAILED: 3, + }, id="cluster", ), pytest.param( None, - {"No offers found": 4}, + {InstanceTerminationReason.NO_OFFERS: 4}, id="non-cluster", ), ], diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index c5b8b7079a..12e439111e 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -401,6 +401,7 @@ async def test_creates_fleet(self, test_db, session: AsyncSession, client: Async "unreachable": False, "health_status": "healthy", "termination_reason": None, + "termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", "backend": None, "region": None, @@ -536,6 +537,7 @@ async def test_creates_ssh_fleet(self, test_db, session: AsyncSession, client: A "unreachable": False, "health_status": "healthy", "termination_reason": None, + "termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", "region": "remote", "availability_zone": None, @@ -709,6 +711,7 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A "unreachable": False, "health_status": "healthy", "termination_reason": None, + "termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", "region": "remote", "availability_zone": None, @@ -742,6 +745,7 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A "unreachable": False, "health_status": "healthy", "termination_reason": None, + "termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", "region": "remote", "availability_zone": None, diff --git a/src/tests/_internal/server/routers/test_instances.py b/src/tests/_internal/server/routers/test_instances.py index f4fe924e4d..8aee09e6d8 100644 --- 
a/src/tests/_internal/server/routers/test_instances.py +++ b/src/tests/_internal/server/routers/test_instances.py @@ -6,6 +6,7 @@ import pytest import pytest_asyncio from httpx import AsyncClient +from sqlalchemy import text from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.models.instances import InstanceStatus @@ -372,3 +373,25 @@ async def test_returns_health_checks(self, session: AsyncSession, client: AsyncC {"collected_at": "2025-01-01T12:00:00+00:00", "status": "healthy", "events": []}, ] } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db") +class TestCompatibility: + async def test_converts_legacy_termination_reason_string( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session) + project = await create_project(session, owner=user) + fleet = await create_fleet(session, project) + await create_instance(session=session, project=project, fleet=fleet) + await session.execute( + text("UPDATE instances SET termination_reason = 'Fleet has too many instances'") + ) + await session.commit() + resp = await client.post( + "/api/instances/list", headers=get_auth_headers(user.token), json={} + ) + # Must convert legacy "Fleet has too many instances" to "max_instances_limit" + assert resp.json()[0]["termination_reason"] == "max_instances_limit" From eb5935422f6464bbe004843f01c9854e853108e0 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Mon, 22 Dec 2025 13:18:23 +0100 Subject: [PATCH 013/187] [Docs] Added the `Lambda` example under `Clusters` (#3407) --- docs/examples.md | 10 ++ docs/examples/clusters/lambda/index.md | 0 examples/clusters/lambda/README.md | 217 +++++++++++++++++++++++++ mkdocs.yml | 1 + 4 files changed, 228 insertions(+) create mode 100644 docs/examples/clusters/lambda/index.md create mode 100644 examples/clusters/lambda/README.md diff --git 
a/docs/examples.md b/docs/examples.md index 4a369550cf..6032e72a8b 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -100,6 +100,16 @@ hide: Set up AWS EFA clusters with optimized networking

+ +

+ Lambda +

+ +

+ Set up Lambda clusters with optimized networking +

+

diff --git a/docs/examples/clusters/lambda/index.md b/docs/examples/clusters/lambda/index.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/clusters/lambda/README.md b/examples/clusters/lambda/README.md new file mode 100644 index 0000000000..a78465fbac --- /dev/null +++ b/examples/clusters/lambda/README.md @@ -0,0 +1,217 @@ +--- +title: Distributed workload orchestration on Lambda with dstack +--- + +# Lambda + +[Lambda](https://lambda.ai/) offers two ways to use clusters with a fast interconnect: + +* [Kubernetes](#kubernetes) – Lets you interact with clusters through the Kubernetes API and includes support for NVIDIA GPU operators and related tools. +* [1-Click Clusters (1CC)](#1-click-clusters) – Gives you direct access to clusters in the form of bare-metal nodes. + +Both options use the same underlying networking infrastructure. This example walks you through how to set up Lambda clusters to use with `dstack`. + +## Kubernetes + +!!! info "Prerequsisites" + 1. Follow the instructions in [Lambda's guide](https://docs.lambda.ai/public-cloud/1-click-clusters/managed-kubernetes/#accessing-mk8s) on accessing MK8s. + 2. Go to `Firewall` → `Edit rules`, click `Add rule`, and allow ingress traffic on port `30022`. This port will be used by the `dstack` server to access the jump host. + +### Configure the backend + +Follow the standard instructions for setting up a [Kubernetes](https://dstack.ai/docs/concepts/backends/#kubernetes) backend: + +
+ +```yaml +projects: + - name: main + backends: + - type: kubernetes + kubeconfig: + filename: + proxy_jump: + port: 30022 +``` + +
+ +### Create a fleet + +Once the Kubernetes cluster and the `dstack` server are running, you can create a fleet: + +
+ +```yaml +type: fleet +name: lambda-fleet + +placement: cluster +nodes: 0.. + +backends: [kubernetes] + +resources: + # Specify requirements to filter nodes + gpu: 1..8 +``` + +
+ +Pass the fleet configuration to `dstack apply`: + +
+ +```shell +$ dstack apply -f lambda-fleet.dstack.yml +``` + +
+ +Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). + +## 1-Click Clusters + +Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-click-clusters). While `dstack` supports automated cluster provisioning via [VM-based backends](https://dstack.ai/docs/concepts/backends#vm-based), there is currently no programmatic way to provision Lambda 1CCs. As a result, to use a 1CC cluster with `dstack`, you must use [SSH fleets](https://dstack.ai/docs/concepts/fleets). + +!!! info "Prerequsisites" + 1. Follow the instructions in [Lambda's guide](https://docs.lambda.ai/public-cloud/1-click-clusters/) on working with 1-Click Clusters + +### Create a fleet + +Follow the standard instructions for setting up an [SSH fleet](https://dstack.ai/docs/concepts/fleets/#ssh-fleets): + +
+ +```yaml +type: fleet +name: lambda-fleet + +ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - worker-gpu-8x-b200-rplfm-ll9nr + - worker-gpu-8x-b200-rplfm-qrcs9 + proxy_jump: + hostname: 192.222.55.54 + user: ubuntu + identity_file: ~/.ssh/id_rsa + +placement: cluster +``` + +
+ +> Under `proxy_jump`, we specify the hostname of the head node along with the private SSH key. + +Pass the fleet configuration to `dstack apply`: + +
+ +```shell +$ dstack apply -f lambda-fleet.dstack.yml +``` + +
+ +Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). + +## Run tasks + +To run tasks on a cluster, you must use [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-task). + +### Run NCCL tests + +To validate cluster network bandwidth, use the following task: + +
+ +```yaml +type: task +name: nccl-tests + +nodes: 2 +startup_order: workers-first +stop_criteria: master-done + +commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + -x NCCL_IB_HCA=^mlx5_0 \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -t 1 -g 1 -c 1 -n 100 + else + sleep infinity + fi + +# Uncomment if the `kubernetes` backend requires it for `/dev/infiniband` access +#privileged: true + +resources: + gpu: nvidia:B200:8 + shm_size: 16GB +``` + +
+ +Pass the configuration to `dstack apply`: + +
+ +```shell +$ dstack apply -f lambda-nccl-tests.dstack.yml + +Provisioning... +---> 100% + +# nccl-tests version 2.17.6 nccl-headers=22602 nccl-library=22602 +# Collective test starting: all_reduce_perf +# +# size count type redop root time algbw busbw #wrong time algbw busbw #wrong +# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 36.50 0.00 0.00 0 36.16 0.00 0.00 0 + 16 4 float sum -1 35.55 0.00 0.00 0 35.49 0.00 0.00 0 + 32 8 float sum -1 35.49 0.00 0.00 0 36.28 0.00 0.00 0 + 64 16 float sum -1 35.85 0.00 0.00 0 35.54 0.00 0.00 0 + 128 32 float sum -1 37.36 0.00 0.01 0 36.82 0.00 0.01 0 + 256 64 float sum -1 37.38 0.01 0.01 0 37.80 0.01 0.01 0 + 512 128 float sum -1 51.05 0.01 0.02 0 37.17 0.01 0.03 0 + 1024 256 float sum -1 45.33 0.02 0.04 0 37.98 0.03 0.05 0 + 2048 512 float sum -1 38.67 0.05 0.10 0 38.30 0.05 0.10 0 + 4096 1024 float sum -1 40.08 0.10 0.19 0 39.18 0.10 0.20 0 + 8192 2048 float sum -1 42.13 0.19 0.36 0 41.47 0.20 0.37 0 + 16384 4096 float sum -1 43.66 0.38 0.70 0 41.94 0.39 0.73 0 + 32768 8192 float sum -1 45.42 0.72 1.35 0 43.29 0.76 1.42 0 + 65536 16384 float sum -1 44.59 1.47 2.76 0 43.90 1.49 2.80 0 + 131072 32768 float sum -1 47.44 2.76 5.18 0 46.79 2.80 5.25 0 + 262144 65536 float sum -1 66.68 3.93 7.37 0 65.36 4.01 7.52 0 + 524288 131072 float sum -1 240.71 2.18 4.08 0 125.73 4.17 7.82 0 + 1048576 262144 float sum -1 115.58 9.07 17.01 0 115.48 9.08 17.03 0 + 2097152 524288 float sum -1 114.44 18.33 34.36 0 114.27 18.35 34.41 0 + 4194304 1048576 float sum -1 118.25 35.47 66.50 0 117.11 35.82 67.15 0 + 8388608 2097152 float sum -1 141.39 59.33 111.24 0 134.95 62.16 116.55 0 + 16777216 4194304 float sum -1 186.86 89.78 168.34 0 184.39 90.99 170.60 0 + 33554432 8388608 float sum -1 255.79 131.18 245.96 0 253.88 132.16 247.81 0 + 67108864 16777216 float sum -1 350.41 191.52 359.09 0 350.71 191.35 358.79 0 + 134217728 33554432 float sum -1 596.75 224.92 421.72 0 595.37 225.44 422.69 0 + 268435456 67108864 float sum 
-1 934.67 287.20 538.50 0 931.37 288.22 540.41 0 + 536870912 134217728 float sum -1 1625.63 330.25 619.23 0 1687.31 318.18 596.59 0 + 1073741824 268435456 float sum -1 2972.25 361.26 677.35 0 2971.33 361.37 677.56 0 + 2147483648 536870912 float sum -1 5784.75 371.23 696.06 0 5728.40 374.88 702.91 0 +# Out of bounds values : 0 OK +# Avg bus bandwidth : 137.179 +``` + +
+ +## What's next + +1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services) +2. Read the [Kuberentes](https://dstack.ai/docs/guides/kubernetes), and [Clusters](https://dstack.ai/docs/guides/clusters) guides +3. Check Lambda's docs on [Kubernetes](https://docs.lambda.ai/public-cloud/1-click-clusters/managed-kubernetes/#accessing-mk8s) and [1CC](https://docs.lambda.ai/public-cloud/1-click-clusters/) diff --git a/mkdocs.yml b/mkdocs.yml index a3d6d1e230..e793bd23c2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -325,6 +325,7 @@ nav: - Clusters: - AWS: examples/clusters/aws/index.md - GCP: examples/clusters/gcp/index.md + - Lambda: examples/clusters/lambda/index.md - Crusoe: examples/clusters/crusoe/index.md - NCCL/RCCL tests: examples/clusters/nccl-rccl-tests/index.md - Inference: From c391d34d2595d9e2e74a17c2d881cf7a38102b76 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Tue, 23 Dec 2025 09:47:23 +0000 Subject: [PATCH 014/187] [runner] Revamp main.go (#3411) * Migrate to urfave/cli/v3 * Merge cmd.go into main.go * Pass ctx to Server.Run() --- runner/cmd/runner/cmd.go | 79 ------------------------- runner/cmd/runner/main.go | 86 +++++++++++++++++++++++++--- runner/go.mod | 4 -- runner/go.sum | 8 --- runner/internal/runner/api/server.go | 19 +++--- 5 files changed, 89 insertions(+), 107 deletions(-) delete mode 100644 runner/cmd/runner/cmd.go diff --git a/runner/cmd/runner/cmd.go b/runner/cmd/runner/cmd.go deleted file mode 100644 index 08f3d5b018..0000000000 --- a/runner/cmd/runner/cmd.go +++ /dev/null @@ -1,79 +0,0 @@ -package main - -import ( - "log" - "os" - - "github.com/urfave/cli/v2" - - "github.com/dstackai/dstack/runner/consts" -) - -// Version is a build-time variable. The value is overridden by ldflags. 
-var Version string - -func App() { - var tempDir string - var homeDir string - var httpPort int - var sshPort int - var logLevel int - - app := &cli.App{ - Name: "dstack-runner", - Usage: "configure and start dstack-runner", - Version: Version, - Flags: []cli.Flag{ - &cli.IntFlag{ - Name: "log-level", - Value: 2, - DefaultText: "4 (Info)", - Usage: "log verbosity level: 2 (Error), 3 (Warning), 4 (Info), 5 (Debug), 6 (Trace)", - Destination: &logLevel, - }, - }, - Commands: []*cli.Command{ - { - Name: "start", - Usage: "Start dstack-runner", - Flags: []cli.Flag{ - &cli.PathFlag{ - Name: "temp-dir", - Usage: "Temporary directory for logs and other files", - Value: consts.RunnerTempDir, - Destination: &tempDir, - }, - &cli.PathFlag{ - Name: "home-dir", - Usage: "HomeDir directory for credentials and $HOME", - Value: consts.RunnerHomeDir, - Destination: &homeDir, - }, - &cli.IntFlag{ - Name: "http-port", - Usage: "Set a http port", - Value: consts.RunnerHTTPPort, - Destination: &httpPort, - }, - &cli.IntFlag{ - Name: "ssh-port", - Usage: "Set the ssh port", - Value: consts.RunnerSSHPort, - Destination: &sshPort, - }, - }, - Action: func(c *cli.Context) error { - err := start(tempDir, homeDir, httpPort, sshPort, logLevel, Version) - if err != nil { - return cli.Exit(err, 1) - } - return nil - }, - }, - }, - } - err := app.Run(os.Args) - if err != nil { - log.Fatal(err) - } -} diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index 27c07292b9..b34ee7b05a 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -4,22 +4,94 @@ import ( "context" "fmt" "io" - _ "net/http/pprof" "os" "path/filepath" "github.com/sirupsen/logrus" + "github.com/urfave/cli/v3" "github.com/dstackai/dstack/runner/consts" "github.com/dstackai/dstack/runner/internal/log" "github.com/dstackai/dstack/runner/internal/runner/api" ) +// Version is a build-time variable. The value is overridden by ldflags. 
+var Version string + func main() { - App() + os.Exit(mainInner()) +} + +func mainInner() int { + var tempDir string + var homeDir string + var httpPort int + var sshPort int + var logLevel int + + cmd := &cli.Command{ + Name: "dstack-runner", + Usage: "configure and start dstack-runner", + Version: Version, + Flags: []cli.Flag{ + &cli.IntFlag{ + Name: "log-level", + Value: 2, + DefaultText: "4 (Info)", + Usage: "log verbosity level: 2 (Error), 3 (Warning), 4 (Info), 5 (Debug), 6 (Trace)", + Destination: &logLevel, + }, + }, + Commands: []*cli.Command{ + { + Name: "start", + Usage: "Start dstack-runner", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "temp-dir", + Usage: "Temporary directory for logs and other files", + Value: consts.RunnerTempDir, + Destination: &tempDir, + TakesFile: true, + }, + &cli.StringFlag{ + Name: "home-dir", + Usage: "HomeDir directory for credentials and $HOME", + Value: consts.RunnerHomeDir, + Destination: &homeDir, + TakesFile: true, + }, + &cli.IntFlag{ + Name: "http-port", + Usage: "Set a http port", + Value: consts.RunnerHTTPPort, + Destination: &httpPort, + }, + &cli.IntFlag{ + Name: "ssh-port", + Usage: "Set the ssh port", + Value: consts.RunnerSSHPort, + Destination: &sshPort, + }, + }, + Action: func(cxt context.Context, cmd *cli.Command) error { + return start(cxt, tempDir, homeDir, httpPort, sshPort, logLevel, Version) + }, + }, + }, + } + + ctx := context.Background() + + if err := cmd.Run(ctx, os.Args); err != nil { + log.Error(ctx, err.Error()) + return 1 + } + + return 0 } -func start(tempDir string, homeDir string, httpPort int, sshPort int, logLevel int, version string) error { +func start(ctx context.Context, tempDir string, homeDir string, httpPort int, sshPort int, logLevel int, version string) error { if err := os.MkdirAll(tempDir, 0o755); err != nil { return fmt.Errorf("create temp directory: %w", err) } @@ -31,20 +103,20 @@ func start(tempDir string, homeDir string, httpPort int, sshPort int, logLevel i defer 
func() { closeErr := defaultLogFile.Close() if closeErr != nil { - log.Error(context.TODO(), "Failed to close default log file", "err", closeErr) + log.Error(ctx, "Failed to close default log file", "err", closeErr) } }() log.DefaultEntry.Logger.SetOutput(io.MultiWriter(os.Stdout, defaultLogFile)) log.DefaultEntry.Logger.SetLevel(logrus.Level(logLevel)) - server, err := api.NewServer(context.TODO(), tempDir, homeDir, fmt.Sprintf(":%d", httpPort), sshPort, version) + server, err := api.NewServer(ctx, tempDir, homeDir, fmt.Sprintf(":%d", httpPort), sshPort, version) if err != nil { return fmt.Errorf("create server: %w", err) } - log.Trace(context.TODO(), "Starting API server", "port", httpPort) - if err := server.Run(); err != nil { + log.Trace(ctx, "Starting API server", "port", httpPort) + if err := server.Run(ctx); err != nil { return fmt.Errorf("server failed: %w", err) } diff --git a/runner/go.mod b/runner/go.mod index b317f6c7b0..260fb880ae 100644 --- a/runner/go.mod +++ b/runner/go.mod @@ -20,7 +20,6 @@ require ( github.com/shirou/gopsutil/v4 v4.24.11 github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.11.1 - github.com/urfave/cli/v2 v2.27.7 github.com/urfave/cli/v3 v3.6.1 golang.org/x/crypto v0.22.0 golang.org/x/sys v0.26.0 @@ -33,7 +32,6 @@ require ( github.com/bits-and-blooms/bitset v1.22.0 // indirect github.com/cloudflare/circl v1.3.7 // indirect github.com/containerd/log v0.1.0 // indirect - github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/cyphar/filepath-securejoin v0.2.4 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/distribution/reference v0.6.0 // indirect @@ -62,7 +60,6 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect - github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 // indirect 
github.com/skeema/knownhosts v1.2.2 // indirect github.com/tidwall/btree v1.7.0 // indirect @@ -70,7 +67,6 @@ require ( github.com/tklauser/numcpus v0.6.1 // indirect github.com/ulikunitz/xz v0.5.12 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect - github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.50.0 // indirect go.opentelemetry.io/otel v1.25.0 // indirect diff --git a/runner/go.sum b/runner/go.sum index de734fa39a..20c4568f9f 100644 --- a/runner/go.sum +++ b/runner/go.sum @@ -34,8 +34,6 @@ github.com/codeclysm/extract/v4 v4.0.0 h1:H87LFsUNaJTu2e/8p/oiuiUsOK/TaPQ5wxsjPn github.com/codeclysm/extract/v4 v4.0.0/go.mod h1:SFju1lj6as7FvUgalpSct7torJE0zttbJUWtryPRG6s= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= -github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3sHPnBo= -github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= @@ -155,8 +153,6 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= -github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= -github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 
github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3 h1:n661drycOFuPLCN3Uc8sB6B/s6Z4t2xvBgU1htSHuq8= github.com/sergi/go-diff v1.3.2-0.20230802210424-5b0b94c5c0d3/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/shirou/gopsutil/v4 v4.24.11 h1:WaU9xqGFKvFfsUv94SXcUPD7rCkU0vr/asVdQOBZNj8= @@ -185,14 +181,10 @@ github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+F github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc= github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= -github.com/urfave/cli/v2 v2.27.7 h1:bH59vdhbjLv3LAvIu6gd0usJHgoTTPhCFib8qqOwXYU= -github.com/urfave/cli/v2 v2.27.7/go.mod h1:CyNAG/xg+iAOg0N4MPGZqVmv2rCoP267496AOXUZjA4= github.com/urfave/cli/v3 v3.6.1 h1:j8Qq8NyUawj/7rTYdBGrxcH7A/j7/G8Q5LhWEW4G3Mo= github.com/urfave/cli/v3 v3.6.1/go.mod h1:ysVLtOEmg2tOy6PknnYVhDoouyC/6N42TMeoMzskhso= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= -github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= -github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= diff --git a/runner/internal/runner/api/server.go b/runner/internal/runner/api/server.go index 9d98315b1b..c973f45e1a 100644 --- a/runner/internal/runner/api/server.go +++ b/runner/internal/runner/api/server.go @@ -4,6 +4,7 @@ import ( "context" "errors" "net/http" + _ "net/http/pprof" "os" "os/signal" "syscall" @@ -80,21 
+81,21 @@ func NewServer(ctx context.Context, tempDir string, homeDir string, address stri return s, nil } -func (s *Server) Run() error { - signals := []os.Signal{os.Interrupt, syscall.SIGTERM, syscall.SIGKILL, syscall.SIGQUIT} +func (s *Server) Run(ctx context.Context) error { + signals := []os.Signal{os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT} signalCh := make(chan os.Signal, 1) go func() { if err := s.srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { - log.Error(context.TODO(), "Server failed", "err", err) + log.Error(ctx, "Server failed", "err", err) } }() - defer func() { _ = s.srv.Shutdown(context.TODO()) }() + defer func() { _ = s.srv.Shutdown(ctx) }() select { case <-s.jobBarrierCh: // job started case <-time.After(s.submitWaitDuration): - log.Error(context.TODO(), "Job didn't start in time, shutting down") + log.Error(ctx, "Job didn't start in time, shutting down") return errors.New("no job submitted") } @@ -103,10 +104,10 @@ func (s *Server) Run() error { signal.Notify(signalCh, signals...) select { case <-signalCh: - log.Error(context.TODO(), "Received interrupt signal, shutting down") + log.Error(ctx, "Received interrupt signal, shutting down") s.stop() case <-s.jobBarrierCh: - log.Info(context.TODO(), "Job finished, shutting down") + log.Info(ctx, "Job finished, shutting down") } close(s.shutdownCh) signal.Reset(signals...) 
@@ -123,9 +124,9 @@ loop: for _, ch := range logsToWait { select { case <-ch.ch: - log.Info(context.TODO(), "Logs streaming finished", "endpoint", ch.name) + log.Info(ctx, "Logs streaming finished", "endpoint", ch.name) case <-waitLogsDone: - log.Error(context.TODO(), "Logs streaming didn't finish in time") + log.Error(ctx, "Logs streaming didn't finish in time") break loop // break the loop, not the select } } From aec51dcdbd2f00f737ed5b81273203479cb4ff97 Mon Sep 17 00:00:00 2001 From: Oleg Date: Tue, 23 Dec 2025 13:16:21 +0300 Subject: [PATCH 015/187] Was implemented Event list for job, run and fleet (#3392) Was implemented Event list for job, run and fleet --- frontend/src/hooks/useInfiniteScroll.ts | 12 +- frontend/src/locale/en.json | 3 +- .../src/pages/Events/List/hooks/useFilters.ts | 37 +++-- .../src/pages/Fleets/Details/Events/index.tsx | 56 ++++++++ .../Fleets/Details/FleetDetails/index.tsx | 97 +++++++++++++ frontend/src/pages/Fleets/Details/index.tsx | 136 ++++++------------ .../pages/Fleets/Details/styles.module.scss | 18 +++ .../pages/Runs/Details/Events/List/index.tsx | 56 ++++++++ .../pages/Runs/Details/Jobs/Details/index.tsx | 10 ++ .../pages/Runs/Details/Jobs/Events/index.tsx | 78 ++++++++++ .../pages/Runs/Details/RunDetails/index.tsx | 3 + frontend/src/pages/Runs/Details/constants.ts | 6 + frontend/src/pages/Runs/Details/index.tsx | 12 +- frontend/src/pages/Runs/index.ts | 1 + frontend/src/router.tsx | 31 +++- frontend/src/routes.ts | 19 +++ 16 files changed, 458 insertions(+), 117 deletions(-) create mode 100644 frontend/src/pages/Fleets/Details/Events/index.tsx create mode 100644 frontend/src/pages/Fleets/Details/FleetDetails/index.tsx create mode 100644 frontend/src/pages/Fleets/Details/styles.module.scss create mode 100644 frontend/src/pages/Runs/Details/Events/List/index.tsx create mode 100644 frontend/src/pages/Runs/Details/Jobs/Events/index.tsx create mode 100644 frontend/src/pages/Runs/Details/constants.ts diff --git 
a/frontend/src/hooks/useInfiniteScroll.ts b/frontend/src/hooks/useInfiniteScroll.ts index 3a3813ff92..727586ab00 100644 --- a/frontend/src/hooks/useInfiniteScroll.ts +++ b/frontend/src/hooks/useInfiniteScroll.ts @@ -14,6 +14,7 @@ type UseInfinityParams = { useLazyQuery: UseLazyQuery, any>>; args: { limit?: number } & Args; getPaginationParams: (listItem: DataItem) => Partial; + skip?: boolean; // options?: UseQueryStateOptions, Record>; }; @@ -22,6 +23,7 @@ export const useInfiniteScroll = ({ getPaginationParams, // options, args, + skip, }: UseInfinityParams) => { const [data, setData] = useState>([]); const scrollElement = useRef(document.documentElement); @@ -55,14 +57,14 @@ export const useInfiniteScroll = ({ }; useEffect(() => { - if (!isEqual(argsProp, lastArgsProps.current)) { + if (!isEqual(argsProp, lastArgsProps.current) && !skip) { getEmptyList(); lastArgsProps.current = argsProp as Args; } - }, [argsProp, lastArgsProps]); + }, [argsProp, lastArgsProps, skip]); const getMore = async () => { - if (isLoadingRef.current || disabledMore) { + if (isLoadingRef.current || disabledMore || skip) { return; } @@ -83,7 +85,9 @@ export const useInfiniteScroll = ({ console.log(e); } - isLoadingRef.current = false; + setTimeout(() => { + isLoadingRef.current = false; + }, 10); }; useLayoutEffect(() => { diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index 3281ba8f4c..7c07a5f938 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -52,7 +52,8 @@ "refresh": "Refresh", "quickstart": "Quickstart", "ask_ai": "Ask AI", - "new": "New" + "new": "New", + "full_view": "Full view" }, "auth": { diff --git a/frontend/src/pages/Events/List/hooks/useFilters.ts b/frontend/src/pages/Events/List/hooks/useFilters.ts index 5ef714c763..56aa1f67df 100644 --- a/frontend/src/pages/Events/List/hooks/useFilters.ts +++ b/frontend/src/pages/Events/List/hooks/useFilters.ts @@ -54,7 +54,14 @@ const multipleChoiseKeys: RequestParamsKeys[] = [ 
'actors', ]; -const targetTypes = ['project', 'user', 'fleet', 'instance', 'run', 'job']; +const targetTypes = [ + { label: 'Project', value: 'project' }, + { label: 'User', value: 'user' }, + { label: 'Fleet', value: 'fleet' }, + { label: 'Instance', value: 'instance' }, + { label: 'Run', value: 'run' }, + { label: 'Job', value: 'job' }, +]; export const useFilters = () => { const [searchParams, setSearchParams] = useSearchParams(); @@ -100,7 +107,7 @@ export const useFilters = () => { targetTypes?.forEach((targetType) => { options.push({ propertyKey: filterKeys.INCLUDE_TARGET_TYPES, - value: targetType, + value: targetType.label, }); }); @@ -117,53 +124,53 @@ export const useFilters = () => { { key: filterKeys.TARGET_PROJECTS, operators: ['='], - propertyLabel: 'Target Projects', + propertyLabel: 'Target projects', groupValuesLabel: 'Project ids', }, { key: filterKeys.TARGET_USERS, operators: ['='], - propertyLabel: 'Target Users', + propertyLabel: 'Target users', groupValuesLabel: 'Project ids', }, { key: filterKeys.TARGET_FLEETS, operators: ['='], - propertyLabel: 'Target Fleets', + propertyLabel: 'Target fleets', }, { key: filterKeys.TARGET_INSTANCES, operators: ['='], - propertyLabel: 'Target Instances', + propertyLabel: 'Target instances', }, { key: filterKeys.TARGET_RUNS, operators: ['='], - propertyLabel: 'Target Runs', + propertyLabel: 'Target runs', }, { key: filterKeys.TARGET_JOBS, operators: ['='], - propertyLabel: 'Target Jobs', + propertyLabel: 'Target jobs', }, { key: filterKeys.WITHIN_PROJECTS, operators: ['='], - propertyLabel: 'Within Projects', + propertyLabel: 'Within projects', groupValuesLabel: 'Project ids', }, { key: filterKeys.WITHIN_FLEETS, operators: ['='], - propertyLabel: 'Within Fleets', + propertyLabel: 'Within fleets', }, { key: filterKeys.WITHIN_RUNS, operators: ['='], - propertyLabel: 'Within Runs', + propertyLabel: 'Within runs', }, { @@ -240,6 +247,14 @@ export const useFilters = () => { ), } : {}), + + 
...(params[filterKeys.INCLUDE_TARGET_TYPES] && Array.isArray(params[filterKeys.INCLUDE_TARGET_TYPES]) + ? { + [filterKeys.INCLUDE_TARGET_TYPES]: params[filterKeys.INCLUDE_TARGET_TYPES]?.map( + (selectedLabel: string) => targetTypes?.find(({ label }) => label === selectedLabel)?.['value'], + ), + } + : {}), }; return { diff --git a/frontend/src/pages/Fleets/Details/Events/index.tsx b/frontend/src/pages/Fleets/Details/Events/index.tsx new file mode 100644 index 0000000000..9a81c7dec3 --- /dev/null +++ b/frontend/src/pages/Fleets/Details/Events/index.tsx @@ -0,0 +1,56 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import Button from '@cloudscape-design/components/button'; + +import { Header, Loader, Table } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useCollection, useInfiniteScroll } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetAllEventsQuery } from 'services/events'; + +import { useColumnsDefinitions } from 'pages/Events/List/hooks/useColumnDefinitions'; + +export const EventsList = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramFleetId = params.fleetId ?? 
''; + const navigate = useNavigate(); + + const { data, isLoading, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllEventsQuery, + args: { limit: DEFAULT_TABLE_PAGE_SIZE, within_fleets: [paramFleetId] }, + + getPaginationParams: (lastEvent) => ({ + prev_recorded_at: lastEvent.recorded_at, + prev_id: lastEvent.id, + }), + }); + + const { items, collectionProps } = useCollection(data, { + selection: {}, + }); + + const goToFullView = () => { + navigate(ROUTES.EVENTS.LIST + `?within_fleets=${paramFleetId}`); + }; + + const { columns } = useColumnsDefinitions(); + + return ( + {t('common.full_view')}}> + {t('navigation.events')} + + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Fleets/Details/FleetDetails/index.tsx b/frontend/src/pages/Fleets/Details/FleetDetails/index.tsx new file mode 100644 index 0000000000..19d818c236 --- /dev/null +++ b/frontend/src/pages/Fleets/Details/FleetDetails/index.tsx @@ -0,0 +1,97 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import { format } from 'date-fns'; + +import { Box, ColumnLayout, Container, Header, Loader, NavigateLink, StatusIndicator } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { getFleetInstancesLinkText, getFleetPrice, getFleetStatusIconType } from 'libs/fleet'; +import { ROUTES } from 'routes'; +import { useGetFleetDetailsQuery } from 'services/fleet'; + +export const FleetDetails = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramFleetId = params.fleetId ?? ''; + const paramProjectName = params.projectName ?? 
''; + + const { data, isLoading } = useGetFleetDetailsQuery( + { + projectName: paramProjectName, + fleetId: paramFleetId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + const renderPrice = (fleet: IFleet) => { + const price = getFleetPrice(fleet); + + if (typeof price === 'number') return `$${price}`; + + return '-'; + }; + + return ( + <> + {isLoading && ( + + + + )} + + {data && ( + {t('common.general')}}> + +
+ {t('fleets.fleet')} +
{data.name}
+
+ +
+ {t('fleets.instances.status')} + +
+ + {t(`fleets.statuses.${data.status}`)} + +
+
+ +
+ {t('fleets.instances.project')} + +
+ + {data.project_name} + +
+
+ +
+ {t('fleets.instances.title')} + +
+ + {getFleetInstancesLinkText(data)} + +
+
+ +
+ {t('fleets.instances.started')} +
{format(new Date(data.created_at), DATE_TIME_FORMAT)}
+
+ +
+ {t('fleets.instances.price')} +
{renderPrice(data)}
+
+
+
+ )} + + ); +}; diff --git a/frontend/src/pages/Fleets/Details/index.tsx b/frontend/src/pages/Fleets/Details/index.tsx index e487f7a2c9..d3690fcff2 100644 --- a/frontend/src/pages/Fleets/Details/index.tsx +++ b/frontend/src/pages/Fleets/Details/index.tsx @@ -1,29 +1,22 @@ import React from 'react'; import { useTranslation } from 'react-i18next'; -import { useNavigate, useParams } from 'react-router-dom'; -import { format } from 'date-fns'; +import { Outlet, useNavigate, useParams } from 'react-router-dom'; -import { - Box, - Button, - ColumnLayout, - Container, - ContentLayout, - DetailsHeader, - Header, - Loader, - NavigateLink, - StatusIndicator, -} from 'components'; +import { Button, ContentLayout, DetailsHeader, Tabs } from 'components'; + +enum CodeTab { + Details = 'details', + Events = 'events', +} -import { DATE_TIME_FORMAT } from 'consts'; import { useBreadcrumbs } from 'hooks'; -import { getFleetInstancesLinkText, getFleetPrice, getFleetStatusIconType } from 'libs/fleet'; import { ROUTES } from 'routes'; import { useGetFleetDetailsQuery } from 'services/fleet'; import { useDeleteFleet } from '../List/useDeleteFleet'; +import styles from './styles.module.scss'; + export const FleetDetails: React.FC = () => { const { t } = useTranslation(); const params = useParams(); @@ -33,7 +26,7 @@ export const FleetDetails: React.FC = () => { const { deleteFleets, isDeleting } = useDeleteFleet(); - const { data, isLoading } = useGetFleetDetailsQuery( + const { data } = useGetFleetDetailsQuery( { projectName: paramProjectName, fleetId: paramFleetId, @@ -72,87 +65,42 @@ export const FleetDetails: React.FC = () => { .catch(console.log); }; - const renderPrice = (fleet: IFleet) => { - const price = getFleetPrice(fleet); - - if (typeof price === 'number') return `$${price}`; - - return '-'; - }; - const isDisabledDeleteButton = !data || isDeleting; return ( - - - - } +
+ + + + } + /> + } + > + - } - > - {isLoading && ( - - - - )} - - {data && ( - {t('common.general')}}> - -
- {t('fleets.fleet')} -
{data.name}
-
- -
- {t('fleets.instances.status')} - -
- - {t(`fleets.statuses.${data.status}`)} - -
-
- -
- {t('fleets.instances.project')} - -
- - {data.project_name} - -
-
- -
- {t('fleets.instances.title')} - -
- - {getFleetInstancesLinkText(data)} - -
-
- -
- {t('fleets.instances.started')} -
{format(new Date(data.created_at), DATE_TIME_FORMAT)}
-
-
- {t('fleets.instances.price')} -
{renderPrice(data)}
-
-
-
- )} -
+ + +
); }; diff --git a/frontend/src/pages/Fleets/Details/styles.module.scss b/frontend/src/pages/Fleets/Details/styles.module.scss new file mode 100644 index 0000000000..1a7d41a9c5 --- /dev/null +++ b/frontend/src/pages/Fleets/Details/styles.module.scss @@ -0,0 +1,18 @@ +.page { + height: 100%; + + & [class^="awsui_tabs-content"] { + display: none; + } + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/Runs/Details/Events/List/index.tsx b/frontend/src/pages/Runs/Details/Events/List/index.tsx new file mode 100644 index 0000000000..79ccb54436 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Events/List/index.tsx @@ -0,0 +1,56 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import Button from '@cloudscape-design/components/button'; + +import { Header, Loader, Table } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useCollection, useInfiniteScroll } from 'hooks'; +import { ROUTES } from 'routes'; +import { useLazyGetAllEventsQuery } from 'services/events'; + +import { useColumnsDefinitions } from 'pages/Events/List/hooks/useColumnDefinitions'; + +export const EventsList = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramRunId = params.runId ?? 
''; + const navigate = useNavigate(); + + const { data, isLoading, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllEventsQuery, + args: { limit: DEFAULT_TABLE_PAGE_SIZE, within_runs: [paramRunId] }, + + getPaginationParams: (lastEvent) => ({ + prev_recorded_at: lastEvent.recorded_at, + prev_id: lastEvent.id, + }), + }); + + const { items, collectionProps } = useCollection(data, { + selection: {}, + }); + + const goToFullView = () => { + navigate(ROUTES.EVENTS.LIST + `?within_runs=${paramRunId}`); + }; + + const { columns } = useColumnsDefinitions(); + + return ( +
{t('common.full_view')}}> + {t('navigation.events')} + + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Runs/Details/Jobs/Details/index.tsx b/frontend/src/pages/Runs/Details/Jobs/Details/index.tsx index da44e7ea2c..ffdc2d460c 100644 --- a/frontend/src/pages/Runs/Details/Jobs/Details/index.tsx +++ b/frontend/src/pages/Runs/Details/Jobs/Details/index.tsx @@ -15,6 +15,7 @@ enum CodeTab { Details = 'details', Metrics = 'metrics', Logs = 'logs', + Events = 'Events', } export const JobDetailsPage: React.FC = () => { @@ -97,6 +98,15 @@ export const JobDetailsPage: React.FC = () => { paramJobName, ), }, + { + label: 'Events', + id: CodeTab.Events, + href: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.JOBS.DETAILS.EVENTS.FORMAT( + paramProjectName, + paramRunId, + paramJobName, + ), + }, ]} /> diff --git a/frontend/src/pages/Runs/Details/Jobs/Events/index.tsx b/frontend/src/pages/Runs/Details/Jobs/Events/index.tsx new file mode 100644 index 0000000000..48adc56364 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Jobs/Events/index.tsx @@ -0,0 +1,78 @@ +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; +import Button from '@cloudscape-design/components/button'; + +import { Header, Loader, Table } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useCollection, useInfiniteScroll } from 'hooks'; +import { useLazyGetAllEventsQuery } from 'services/events'; + +import { useColumnsDefinitions } from 'pages/Events/List/hooks/useColumnDefinitions'; + +import { ROUTES } from '../../../../../routes'; +import { useGetRunQuery } from '../../../../../services/run'; + +export const EventsList = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramRunId = params.runId ?? ''; + const paramJobName = params.jobName ?? 
''; + const navigate = useNavigate(); + + const { data: runData, isLoading: isLoadingRun } = useGetRunQuery({ + project_name: paramProjectName, + id: paramRunId, + }); + + const jobId = useMemo(() => { + if (!runData) return; + + return runData.jobs.find((job) => job.job_spec.job_name === paramJobName)?.job_submissions?.[0]?.id; + }, [runData]); + + const { data, isLoading, isLoadingMore } = useInfiniteScroll({ + useLazyQuery: useLazyGetAllEventsQuery, + args: { limit: DEFAULT_TABLE_PAGE_SIZE, target_jobs: jobId ? [jobId] : undefined }, + skip: !jobId, + + getPaginationParams: (lastEvent) => ({ + prev_recorded_at: lastEvent.recorded_at, + prev_id: lastEvent.id, + }), + }); + + const goToFullView = () => { + navigate(ROUTES.EVENTS.LIST + `?target_jobs=${jobId}`); + }; + + const { items, collectionProps } = useCollection(data, { + selection: {}, + }); + + const { columns } = useColumnsDefinitions(); + + return ( +
+ {t('common.full_view')} + + } + > + {t('navigation.events')} + + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Runs/Details/RunDetails/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/index.tsx index 1547fa8867..c00b2ce9d2 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/index.tsx @@ -25,6 +25,7 @@ import { getRunListItemServiceUrl, getRunListItemSpot, } from '../../List/helpers'; +import { EventsList } from '../Events/List'; import { JobList } from '../Jobs/List'; import { ConnectToRunWithDevEnvConfiguration } from './ConnectToRunWithDevEnvConfiguration'; @@ -202,6 +203,8 @@ export const RunDetails = () => { runPriority={getRunPriority(runData)} /> )} + + {runData.jobs.length > 1 && } ); }; diff --git a/frontend/src/pages/Runs/Details/constants.ts b/frontend/src/pages/Runs/Details/constants.ts new file mode 100644 index 0000000000..1bf4bc69c0 --- /dev/null +++ b/frontend/src/pages/Runs/Details/constants.ts @@ -0,0 +1,6 @@ +export enum CodeTab { + Details = 'details', + Metrics = 'metrics', + Logs = 'logs', + Events = 'events', +} diff --git a/frontend/src/pages/Runs/Details/index.tsx b/frontend/src/pages/Runs/Details/index.tsx index f68c98fa17..78e9850c8e 100644 --- a/frontend/src/pages/Runs/Details/index.tsx +++ b/frontend/src/pages/Runs/Details/index.tsx @@ -15,15 +15,10 @@ import { isAvailableStoppingForRun, // isAvailableDeletingForRun, } from '../utils'; +import { CodeTab } from './constants'; import styles from './styles.module.scss'; -enum CodeTab { - Details = 'details', - Metrics = 'metrics', - Logs = 'logs', -} - export const RunDetailsPage: React.FC = () => { const { t } = useTranslation(); // const navigate = useNavigate(); @@ -189,6 +184,11 @@ export const RunDetailsPage: React.FC = () => { id: CodeTab.Metrics, href: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.METRICS.FORMAT(paramProjectName, paramRunId), }, + { + label: 'Events', + id: CodeTab.Events, + href: 
ROUTES.PROJECT.DETAILS.RUNS.DETAILS.EVENTS.FORMAT(paramProjectName, paramRunId), + }, ]} /> )} diff --git a/frontend/src/pages/Runs/index.ts b/frontend/src/pages/Runs/index.ts index 5e30508fed..4e97fd2e09 100644 --- a/frontend/src/pages/Runs/index.ts +++ b/frontend/src/pages/Runs/index.ts @@ -2,6 +2,7 @@ export { RunList } from './List'; export { RunDetailsPage } from './Details'; export { RunDetails } from './Details/RunDetails'; export { JobMetrics } from './Details/Jobs/Metrics'; +export { EventsList } from './Details/Events/List'; export { JobLogs } from './Details/Logs'; export { Artifacts } from './Details/Artifacts'; export { CreateDevEnvironment } from './CreateDevEnvironment'; diff --git a/frontend/src/router.tsx b/frontend/src/router.tsx index 4a75bbf510..1bba4cb161 100644 --- a/frontend/src/router.tsx +++ b/frontend/src/router.tsx @@ -11,14 +11,25 @@ import { LoginByOktaCallback } from 'App/Login/LoginByOktaCallback'; import { TokenLogin } from 'App/Login/TokenLogin'; import { Logout } from 'App/Logout'; import { FleetDetails, FleetList } from 'pages/Fleets'; +import { EventsList as FleetEventsList } from 'pages/Fleets/Details/Events'; +import { FleetDetails as FleetDetailsGeneral } from 'pages/Fleets/Details/FleetDetails'; import { InstanceList } from 'pages/Instances'; import { ModelsList } from 'pages/Models'; import { ModelDetails } from 'pages/Models/Details'; import { CreateProjectWizard, ProjectAdd, ProjectDetails, ProjectList, ProjectSettings } from 'pages/Project'; import { BackendAdd, BackendEdit } from 'pages/Project/Backends'; import { AddGateway, EditGateway } from 'pages/Project/Gateways'; -import { CreateDevEnvironment, JobLogs, JobMetrics, RunDetails, RunDetailsPage, RunList } from 'pages/Runs'; +import { + CreateDevEnvironment, + EventsList as RunEvents, + JobLogs, + JobMetrics, + RunDetails, + RunDetailsPage, + RunList, +} from 'pages/Runs'; import { JobDetailsPage } from 'pages/Runs/Details/Jobs/Details'; +import { EventsList as 
JobEvents } from 'pages/Runs/Details/Jobs/Events'; import { CreditsHistoryAdd, UserAdd, UserDetails, UserEdit, UserList } from 'pages/User'; import { UserBilling, UserProjects, UserSettings } from 'pages/User/Details'; @@ -107,6 +118,10 @@ export const router = createBrowserRouter([ path: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.LOGS.TEMPLATE, element: , }, + { + path: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.EVENTS.TEMPLATE, + element: , + }, ], }, { @@ -125,6 +140,10 @@ export const router = createBrowserRouter([ path: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.JOBS.DETAILS.LOGS.TEMPLATE, element: , }, + { + path: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.JOBS.DETAILS.EVENTS.TEMPLATE, + element: , + }, ], }, @@ -180,6 +199,16 @@ export const router = createBrowserRouter([ { path: ROUTES.FLEETS.DETAILS.TEMPLATE, element: , + children: [ + { + index: true, + element: , + }, + { + path: ROUTES.FLEETS.DETAILS.EVENTS.TEMPLATE, + element: , + }, + ], }, // Instances diff --git a/frontend/src/routes.ts b/frontend/src/routes.ts index b591af5f67..6bc1fb0e5a 100644 --- a/frontend/src/routes.ts +++ b/frontend/src/routes.ts @@ -33,6 +33,11 @@ export const ROUTES = { FORMAT: (projectName: string, runId: string) => buildRoute(ROUTES.PROJECT.DETAILS.RUNS.DETAILS.METRICS.TEMPLATE, { projectName, runId }), }, + EVENTS: { + TEMPLATE: `/projects/:projectName/runs/:runId/events`, + FORMAT: (projectName: string, runId: string) => + buildRoute(ROUTES.PROJECT.DETAILS.RUNS.DETAILS.EVENTS.TEMPLATE, { projectName, runId }), + }, LOGS: { TEMPLATE: `/projects/:projectName/runs/:runId/logs`, FORMAT: (projectName: string, runId: string) => @@ -65,6 +70,15 @@ export const ROUTES = { jobName, }), }, + EVENTS: { + TEMPLATE: `/projects/:projectName/runs/:runId/jobs/:jobName/events`, + FORMAT: (projectName: string, runId: string, jobName: string) => + buildRoute(ROUTES.PROJECT.DETAILS.RUNS.DETAILS.JOBS.DETAILS.EVENTS.TEMPLATE, { + projectName, + runId, + jobName, + }), + }, }, }, }, @@ -122,6 +136,11 @@ export const 
ROUTES = { TEMPLATE: `/projects/:projectName/fleets/:fleetId`, FORMAT: (projectName: string, fleetId: string) => buildRoute(ROUTES.FLEETS.DETAILS.TEMPLATE, { projectName, fleetId }), + EVENTS: { + TEMPLATE: `/projects/:projectName/fleets/:fleetId/events`, + FORMAT: (projectName: string, fleetId: string) => + buildRoute(ROUTES.FLEETS.DETAILS.EVENTS.TEMPLATE, { projectName, fleetId }), + }, }, }, From 12d182ebc5a45b412fc86a607a7da1049ad12651 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Tue, 23 Dec 2025 23:10:29 +0000 Subject: [PATCH 016/187] Fix event target type rendering in server logs (#3414) Before: ``` Emitting event: Project deleted. Event targets: EventTargetType.PROJECT(65dce4)test ``` After: ``` Emitting event: Project deleted. Event targets: project(65dce4)test ``` The issue was only reproducible on [Python 3.11+](https://peps.python.org/pep-0663/) --- src/dstack/_internal/server/services/events.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dstack/_internal/server/services/events.py b/src/dstack/_internal/server/services/events.py index 58037863eb..7a4d355237 100644 --- a/src/dstack/_internal/server/services/events.py +++ b/src/dstack/_internal/server/services/events.py @@ -138,7 +138,7 @@ def from_model( raise ValueError(f"Unsupported model type: {type(model)}") def fmt(self) -> str: - return fmt_entity(self.type, self.id, self.name) + return fmt_entity(self.type.value, self.id, self.name) def emit(session: AsyncSession, message: str, actor: AnyActor, targets: list[Target]) -> None: @@ -389,7 +389,7 @@ async def list_events( def event_model_to_event(event_model: EventModel) -> Event: targets = [ EventTarget( - type=target.entity_type, + type=target.entity_type.value, project_id=target.entity_project_id, project_name=target.entity_project.name if target.entity_project else None, id=target.entity_id, From 237ddd15bc4dfc5904d58405a4168d19203d1afc Mon Sep 17 00:00:00 2001 From: jvstme 
<36324149+jvstme@users.noreply.github.com> Date: Wed, 24 Dec 2025 08:41:48 +0000 Subject: [PATCH 017/187] Support `gateway: true` in service configurations (#3413) Allow setting `gateway: true` in service configurations, which will enforce that the service runs on the default gateway or is rejected if one isn't available. **Compatibility**: pre-0.20.1 clients will not be able to list (`dstack ps`), get, or update (`dstack apply`) services that were created by 0.20.1+ clients with the `gateway: true` property. --- .../_internal/core/models/configurations.py | 13 ++---------- .../server/services/services/__init__.py | 4 ++++ .../_internal/server/routers/test_runs.py | 20 ++++++++++++++++--- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 158c59b341..9c44155564 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -725,7 +725,8 @@ class ServiceConfigurationParams(CoreModel): Field( description=( "The name of the gateway. Specify boolean `false` to run without a gateway." - " Omit to run with the default gateway" + " Specify boolean `true` to run with the default gateway." 
+ " Omit to run with the default gateway if there is one, or without a gateway otherwise" ), ), ] = None @@ -795,16 +796,6 @@ def convert_replicas(cls, v: Range[int]) -> Range[int]: raise ValueError("The minimum number of replicas must be greater than or equal to 0") return v - @validator("gateway") - def validate_gateway( - cls, v: Optional[Union[bool, str]] - ) -> Optional[Union[Literal[False], str]]: - if v == True: - raise ValueError( - "The `gateway` property must be a string or boolean `false`, not boolean `true`" - ) - return v - @root_validator() def validate_scaling(cls, values): scaling = values.get("scaling") diff --git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index 05c1fa9097..39e8e98c6a 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -55,6 +55,10 @@ async def register_service(session: AsyncSession, run_model: RunModel, run_spec: gateway = await get_project_default_gateway_model( session=session, project=run_model.project ) + if gateway is None and run_spec.configuration.gateway == True: + raise ResourceNotExistsError( + "The service requires a gateway, but there is no default gateway in the project" + ) if gateway is not None: service_spec = await _register_service_in_gateway(session, run_model, run_spec, gateway) diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 77dada59af..5f5037c79d 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -2013,6 +2013,13 @@ def mock_gateway_connections(self) -> Generator[None, None, None]: "https://gateway.default-gateway.example", id="submits-to-default-gateway", ), + pytest.param( + [("default-gateway", True), ("non-default-gateway", False)], + True, + "https://test-service.default-gateway.example", + 
"https://gateway.default-gateway.example", + id="submits-to-default-gateway-when-gateway-true", + ), pytest.param( [("default-gateway", True), ("non-default-gateway", False)], "non-default-gateway", @@ -2108,7 +2115,7 @@ async def test_return_error_if_specified_gateway_not_exists( } @pytest.mark.asyncio - async def test_return_error_if_specified_gateway_is_true( + async def test_return_error_if_specified_gateway_is_true_and_no_gateway_exists( self, test_db, session: AsyncSession, client: AsyncClient ) -> None: user = await create_user(session=session, global_role=GlobalRole.USER) @@ -2123,5 +2130,12 @@ async def test_return_error_if_specified_gateway_is_true( headers=get_auth_headers(user.token), json={"run_spec": run_spec}, ) - assert response.status_code == 422 - assert "must be a string or boolean `false`, not boolean `true`" in response.text + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "msg": "The service requires a gateway, but there is no default gateway in the project", + "code": "resource_not_exists", + } + ] + } From d11b893676cf66e741e0e541375644dab49ed5e8 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 24 Dec 2025 15:19:34 +0500 Subject: [PATCH 018/187] Implement `dstack login` command and CLI OAuth flow (#3415) * Add common OAuth schemas * Add /api/auth/list_providers * Make state a struct * Add /api/auth/get_next_redirect * Implement localhost redirect from UI for github * Implement localhost redirect from UI for all providers * Prototype dstack login * Autoconfigure CLI projects * Minor fixes * Use webbrowser.open * Implement provider selection * Refactor server code into _LoginServer * Test dstack login * Add tests * Normalize server url * Minor adjustments * Document dstack login * Document register_provider * Disable control chars for tests --- docs/docs/reference/cli/dstack/login.md | 17 ++ .../EntraID/LoginByEntraIDCallback/index.tsx | 22 +- .../App/Login/LoginByGithubCallback/index.tsx | 37 
+-- .../App/Login/LoginByGoogleCallback/index.tsx | 22 +- .../App/Login/LoginByOktaCallback/index.tsx | 22 +- frontend/src/api.ts | 1 + frontend/src/services/auth.ts | 9 + mkdocs.yml | 235 ++++++++--------- pyproject.toml | 5 +- pytest.ini | 2 + src/dstack/_internal/cli/commands/login.py | 237 ++++++++++++++++++ src/dstack/_internal/cli/main.py | 2 + src/dstack/_internal/cli/utils/common.py | 5 +- src/dstack/_internal/core/models/auth.py | 28 +++ src/dstack/_internal/server/app.py | 2 + src/dstack/_internal/server/routers/auth.py | 34 +++ .../_internal/server/routers/projects.py | 11 +- src/dstack/_internal/server/schemas/auth.py | 83 ++++++ .../_internal/server/schemas/projects.py | 6 + src/dstack/_internal/server/services/auth.py | 77 ++++++ .../_internal/server/services/projects.py | 12 +- src/dstack/_internal/settings.py | 3 + src/dstack/api/server/__init__.py | 15 +- src/dstack/api/server/_auth.py | 30 +++ src/dstack/api/server/_projects.py | 6 +- .../_internal/cli/commands/test_login.py | 103 ++++++++ src/tests/_internal/cli/common.py | 13 +- .../_internal/server/routers/test_auth.py | 64 +++++ 28 files changed, 937 insertions(+), 166 deletions(-) create mode 100644 docs/docs/reference/cli/dstack/login.md create mode 100644 src/dstack/_internal/cli/commands/login.py create mode 100644 src/dstack/_internal/core/models/auth.py create mode 100644 src/dstack/_internal/server/routers/auth.py create mode 100644 src/dstack/_internal/server/schemas/auth.py create mode 100644 src/dstack/_internal/server/services/auth.py create mode 100644 src/dstack/api/server/_auth.py create mode 100644 src/tests/_internal/cli/commands/test_login.py create mode 100644 src/tests/_internal/server/routers/test_auth.py diff --git a/docs/docs/reference/cli/dstack/login.md b/docs/docs/reference/cli/dstack/login.md new file mode 100644 index 0000000000..d608476e27 --- /dev/null +++ b/docs/docs/reference/cli/dstack/login.md @@ -0,0 +1,17 @@ +# dstack login + +This command authorizes the CLI 
using Single Sign-On and automatically configures your projects. +It provides an alternative to `dstack project add`. + +## Usage + +
+ +```shell +$ dstack login --help +#GENERATE# +``` + +
+ +[//]: # (TODO: Provide examples) diff --git a/frontend/src/App/Login/EntraID/LoginByEntraIDCallback/index.tsx b/frontend/src/App/Login/EntraID/LoginByEntraIDCallback/index.tsx index aa70d00797..036851c3cf 100644 --- a/frontend/src/App/Login/EntraID/LoginByEntraIDCallback/index.tsx +++ b/frontend/src/App/Login/EntraID/LoginByEntraIDCallback/index.tsx @@ -7,7 +7,7 @@ import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; import { useAppDispatch } from 'hooks'; import { ROUTES } from 'routes'; -import { useEntraCallbackMutation } from 'services/auth'; +import { useEntraCallbackMutation, useGetNextRedirectMutation } from 'services/auth'; import { AuthErrorMessage } from 'App/AuthErrorMessage'; import { getBaseUrl } from 'App/helpers'; @@ -23,15 +23,27 @@ export const LoginByEntraIDCallback: React.FC = () => { const [isInvalidCode, setIsInvalidCode] = useState(false); const dispatch = useAppDispatch(); + const [getNextRedirect] = useGetNextRedirectMutation(); const [entraCallback] = useEntraCallbackMutation(); const checkCode = () => { if (code && state) { - entraCallback({ code, state, base_url: getBaseUrl() }) + getNextRedirect({ code, state }) .unwrap() - .then(({ creds: { token } }) => { - dispatch(setAuthData({ token })); - navigate('/'); + .then(({ redirect_url }) => { + if (redirect_url) { + window.location.href = redirect_url; + return; + } + entraCallback({ code, state, base_url: getBaseUrl() }) + .unwrap() + .then(({ creds: { token } }) => { + dispatch(setAuthData({ token })); + navigate('/'); + }) + .catch(() => { + setIsInvalidCode(true); + }); }) .catch(() => { setIsInvalidCode(true); diff --git a/frontend/src/App/Login/LoginByGithubCallback/index.tsx b/frontend/src/App/Login/LoginByGithubCallback/index.tsx index 27d5a755a7..af88aa72f1 100644 --- a/frontend/src/App/Login/LoginByGithubCallback/index.tsx +++ b/frontend/src/App/Login/LoginByGithubCallback/index.tsx @@ -7,7 +7,7 @@ import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; 
import { useAppDispatch } from 'hooks'; import { ROUTES } from 'routes'; -import { useGithubCallbackMutation } from 'services/auth'; +import { useGetNextRedirectMutation, useGithubCallbackMutation } from 'services/auth'; import { useLazyGetProjectsQuery } from 'services/project'; import { AuthErrorMessage } from 'App/AuthErrorMessage'; @@ -23,26 +23,35 @@ export const LoginByGithubCallback: React.FC = () => { const [isInvalidCode, setIsInvalidCode] = useState(false); const dispatch = useAppDispatch(); + const [getNextRedirect] = useGetNextRedirectMutation(); const [githubCallback] = useGithubCallbackMutation(); const [getProjects] = useLazyGetProjectsQuery(); const checkCode = () => { if (code && state) { - githubCallback({ code, state }) + getNextRedirect({ code: code, state: state }) .unwrap() - .then(async ({ creds: { token } }) => { - dispatch(setAuthData({ token })); - - if (process.env.UI_VERSION === 'sky') { - const result = await getProjects().unwrap(); - - if (result?.length === 0) { - navigate(ROUTES.PROJECT.ADD); - return; - } + .then(async ({ redirect_url }) => { + if (redirect_url) { + window.location.href = redirect_url; + return; } - - navigate('/'); + githubCallback({ code, state }) + .unwrap() + .then(async ({ creds: { token } }) => { + dispatch(setAuthData({ token })); + if (process.env.UI_VERSION === 'sky') { + const result = await getProjects().unwrap(); + if (result?.length === 0) { + navigate(ROUTES.PROJECT.ADD); + return; + } + } + navigate('/'); + }) + .catch(() => { + setIsInvalidCode(true); + }); }) .catch(() => { setIsInvalidCode(true); diff --git a/frontend/src/App/Login/LoginByGoogleCallback/index.tsx b/frontend/src/App/Login/LoginByGoogleCallback/index.tsx index 465d0be3ee..4f95f94e27 100644 --- a/frontend/src/App/Login/LoginByGoogleCallback/index.tsx +++ b/frontend/src/App/Login/LoginByGoogleCallback/index.tsx @@ -7,7 +7,7 @@ import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; import { useAppDispatch } from 'hooks'; 
import { ROUTES } from 'routes'; -import { useGoogleCallbackMutation } from 'services/auth'; +import { useGetNextRedirectMutation, useGoogleCallbackMutation } from 'services/auth'; import { AuthErrorMessage } from 'App/AuthErrorMessage'; import { Loading } from 'App/Loading'; @@ -22,15 +22,27 @@ export const LoginByGoogleCallback: React.FC = () => { const [isInvalidCode, setIsInvalidCode] = useState(false); const dispatch = useAppDispatch(); + const [getNextRedirect] = useGetNextRedirectMutation(); const [googleCallback] = useGoogleCallbackMutation(); const checkCode = () => { if (code && state) { - googleCallback({ code, state }) + getNextRedirect({ code, state }) .unwrap() - .then(({ creds: { token } }) => { - dispatch(setAuthData({ token })); - navigate('/'); + .then(({ redirect_url }) => { + if (redirect_url) { + window.location.href = redirect_url; + return; + } + googleCallback({ code, state }) + .unwrap() + .then(({ creds: { token } }) => { + dispatch(setAuthData({ token })); + navigate('/'); + }) + .catch(() => { + setIsInvalidCode(true); + }); }) .catch(() => { setIsInvalidCode(true); diff --git a/frontend/src/App/Login/LoginByOktaCallback/index.tsx b/frontend/src/App/Login/LoginByOktaCallback/index.tsx index ccc9fbc749..72cdc96185 100644 --- a/frontend/src/App/Login/LoginByOktaCallback/index.tsx +++ b/frontend/src/App/Login/LoginByOktaCallback/index.tsx @@ -7,7 +7,7 @@ import { UnauthorizedLayout } from 'layouts/UnauthorizedLayout'; import { useAppDispatch } from 'hooks'; import { ROUTES } from 'routes'; -import { useOktaCallbackMutation } from 'services/auth'; +import { useGetNextRedirectMutation, useOktaCallbackMutation } from 'services/auth'; import { AuthErrorMessage } from 'App/AuthErrorMessage'; import { Loading } from 'App/Loading'; @@ -22,15 +22,27 @@ export const LoginByOktaCallback: React.FC = () => { const [isInvalidCode, setIsInvalidCode] = useState(false); const dispatch = useAppDispatch(); + const [getNextRedirect] = 
useGetNextRedirectMutation(); const [oktaCallback] = useOktaCallbackMutation(); const checkCode = () => { if (code && state) { - oktaCallback({ code, state }) + getNextRedirect({ code, state }) .unwrap() - .then(({ creds: { token } }) => { - dispatch(setAuthData({ token })); - navigate('/'); + .then(({ redirect_url }) => { + if (redirect_url) { + window.location.href = redirect_url; + return; + } + oktaCallback({ code, state }) + .unwrap() + .then(({ creds: { token } }) => { + dispatch(setAuthData({ token })); + navigate('/'); + }) + .catch(() => { + setIsInvalidCode(true); + }); }) .catch(() => { setIsInvalidCode(true); diff --git a/frontend/src/api.ts b/frontend/src/api.ts index 2dea526601..262aa46b75 100644 --- a/frontend/src/api.ts +++ b/frontend/src/api.ts @@ -5,6 +5,7 @@ export const API = { AUTH: { BASE: () => `${API.BASE()}/auth`, + NEXT_REDIRECT: () => `${API.AUTH.BASE()}/get_next_redirect`, GITHUB: { BASE: () => `${API.AUTH.BASE()}/github`, AUTHORIZE: () => `${API.AUTH.GITHUB.BASE()}/authorize`, diff --git a/frontend/src/services/auth.ts b/frontend/src/services/auth.ts index f65892911a..2512ed0a7d 100644 --- a/frontend/src/services/auth.ts +++ b/frontend/src/services/auth.ts @@ -12,6 +12,14 @@ export const authApi = createApi({ tagTypes: ['Auth'], endpoints: (builder) => ({ + getNextRedirect: builder.mutation<{ redirect_url?: string }, { code: string; state: string }>({ + query: (body) => ({ + url: API.AUTH.NEXT_REDIRECT(), + method: 'POST', + body, + }), + }), + githubAuthorize: builder.mutation<{ authorization_url: string }, void>({ query: () => ({ url: API.AUTH.GITHUB.AUTHORIZE(), @@ -103,6 +111,7 @@ export const authApi = createApi({ }); export const { + useGetNextRedirectMutation, useGithubAuthorizeMutation, useGithubCallbackMutation, useGetOktaInfoQuery, diff --git a/mkdocs.yml b/mkdocs.yml index e793bd23c2..74939703e3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -112,67 +112,67 @@ plugins: background_color: "black" color: "#FFFFFF" font_family: 
"Roboto" -# debug: true + # debug: true cards_layout_dir: docs/layouts cards_layout: custom - search - redirects: redirect_maps: - 'blog/2024/02/08/resources-authentication-and-more.md': 'https://github.com/dstackai/dstack/releases/0.15.0' - 'blog/2024/01/19/openai-endpoints-preview.md': 'https://github.com/dstackai/dstack/releases/0.14.0' - 'blog/2023/12/22/disk-size-cuda-12-1-mixtral-and-more.md': 'https://github.com/dstackai/dstack/releases/0.13.0' - 'blog/2023/11/21/vastai.md': 'https://github.com/dstackai/dstack/releases/0.12.3' - 'blog/2023/10/31/tensordock.md': 'https://github.com/dstackai/dstack/releases/0.12.2' - 'blog/2023/10/18/simplified-cloud-setup.md': 'https://github.com/dstackai/dstack/releases/0.12.0' - 'blog/2023/08/22/multiple-clouds.md': 'https://github.com/dstackai/dstack/releases/0.11' - 'blog/2023/08/07/services-preview.md': 'https://github.com/dstackai/dstack/releases/0.10.7' - 'blog/2023/07/14/lambda-cloud-ga-and-docker-support.md': 'https://github.com/dstackai/dstack/releases/0.10.5' - 'blog/2023/05/22/azure-support-better-ui-and-more.md': 'https://github.com/dstackai/dstack/releases/0.9.1' - 'blog/2023/03/13/gcp-support-just-landed.md': 'https://github.com/dstackai/dstack/releases/0.2' - 'blog/dstack-research.md': 'https://dstack.ai/#get-started' - 'docs/dev-environments.md': 'docs/concepts/dev-environments.md' - 'docs/tasks.md': 'docs/concepts/tasks.md' - 'docs/services.md': 'docs/concepts/services.md' - 'docs/fleets.md': 'docs/concepts/fleets.md' - 'docs/examples/llms/llama31.md': 'examples/llms/llama/index.md' - 'docs/examples/llms/llama32.md': 'examples/llms/llama/index.md' - 'examples/llms/llama31/index.md': 'examples/llms/llama/index.md' - 'examples/llms/llama32/index.md': 'examples/llms/llama/index.md' - 'docs/examples/accelerators/amd/index.md': 'examples/accelerators/amd/index.md' - 'docs/examples/deployment/nim/index.md': 'examples/inference/nim/index.md' - 'docs/examples/deployment/vllm/index.md': 
'examples/inference/vllm/index.md' - 'docs/examples/deployment/tgi/index.md': 'examples/inference/tgi/index.md' - 'providers.md': 'partners.md' - 'backends.md': 'partners.md' - 'blog/monitoring-gpu-usage.md': 'blog/posts/dstack-metrics.md' - 'blog/inactive-dev-environments-auto-shutdown.md': 'blog/posts/inactivity-duration.md' - 'blog/data-centers-and-private-clouds.md': 'blog/posts/gpu-blocks-and-proxy-jump.md' - 'blog/distributed-training-with-aws-efa.md': 'examples/clusters/aws/index.md' - 'blog/dstack-stats.md': 'blog/posts/dstack-metrics.md' - 'docs/concepts/metrics.md': 'docs/guides/metrics.md' - 'docs/guides/monitoring.md': 'docs/guides/metrics.md' - 'blog/nvidia-and-amd-on-vultr.md.md': 'blog/posts/nvidia-and-amd-on-vultr.md' - 'examples/misc/nccl-tests/index.md': 'examples/clusters/nccl-rccl-tests/index.md' - 'examples/misc/a3high-clusters/index.md': 'examples/clusters/gcp/index.md' - 'examples/misc/a3mega-clusters/index.md': 'examples/clusters/gcp/index.md' - 'examples/distributed-training/nccl-tests/index.md': 'examples/clusters/nccl-rccl-tests/index.md' - 'examples/distributed-training/rccl-tests/index.md': 'examples/clusters/nccl-rccl-tests/index.md' - 'examples/deployment/nim/index.md': 'examples/inference/nim/index.md' - 'examples/deployment/vllm/index.md': 'examples/inference/vllm/index.md' - 'examples/deployment/tgi/index.md': 'examples/inference/tgi/index.md' - 'examples/deployment/sglang/index.md': 'examples/inference/sglang/index.md' - 'examples/deployment/trtllm/index.md': 'examples/inference/trtllm/index.md' - 'examples/fine-tuning/trl/index.md': 'examples/single-node-training/trl/index.md' - 'examples/fine-tuning/axolotl/index.md': 'examples/single-node-training/axolotl/index.md' - 'blog/efa.md': 'examples/clusters/aws/index.md' - 'docs/concepts/repos.md': 'docs/concepts/dev-environments.md#repos' - 'examples/clusters/a3high/index.md': 'examples/clusters/gcp/index.md' - 'examples/clusters/a3mega/index.md': 'examples/clusters/gcp/index.md' - 
'examples/clusters/a4/index.md': 'examples/clusters/gcp/index.md' - 'examples/clusters/efa/index.md': 'examples/clusters/aws/index.md' + "blog/2024/02/08/resources-authentication-and-more.md": "https://github.com/dstackai/dstack/releases/0.15.0" + "blog/2024/01/19/openai-endpoints-preview.md": "https://github.com/dstackai/dstack/releases/0.14.0" + "blog/2023/12/22/disk-size-cuda-12-1-mixtral-and-more.md": "https://github.com/dstackai/dstack/releases/0.13.0" + "blog/2023/11/21/vastai.md": "https://github.com/dstackai/dstack/releases/0.12.3" + "blog/2023/10/31/tensordock.md": "https://github.com/dstackai/dstack/releases/0.12.2" + "blog/2023/10/18/simplified-cloud-setup.md": "https://github.com/dstackai/dstack/releases/0.12.0" + "blog/2023/08/22/multiple-clouds.md": "https://github.com/dstackai/dstack/releases/0.11" + "blog/2023/08/07/services-preview.md": "https://github.com/dstackai/dstack/releases/0.10.7" + "blog/2023/07/14/lambda-cloud-ga-and-docker-support.md": "https://github.com/dstackai/dstack/releases/0.10.5" + "blog/2023/05/22/azure-support-better-ui-and-more.md": "https://github.com/dstackai/dstack/releases/0.9.1" + "blog/2023/03/13/gcp-support-just-landed.md": "https://github.com/dstackai/dstack/releases/0.2" + "blog/dstack-research.md": "https://dstack.ai/#get-started" + "docs/dev-environments.md": "docs/concepts/dev-environments.md" + "docs/tasks.md": "docs/concepts/tasks.md" + "docs/services.md": "docs/concepts/services.md" + "docs/fleets.md": "docs/concepts/fleets.md" + "docs/examples/llms/llama31.md": "examples/llms/llama/index.md" + "docs/examples/llms/llama32.md": "examples/llms/llama/index.md" + "examples/llms/llama31/index.md": "examples/llms/llama/index.md" + "examples/llms/llama32/index.md": "examples/llms/llama/index.md" + "docs/examples/accelerators/amd/index.md": "examples/accelerators/amd/index.md" + "docs/examples/deployment/nim/index.md": "examples/inference/nim/index.md" + "docs/examples/deployment/vllm/index.md": 
"examples/inference/vllm/index.md" + "docs/examples/deployment/tgi/index.md": "examples/inference/tgi/index.md" + "providers.md": "partners.md" + "backends.md": "partners.md" + "blog/monitoring-gpu-usage.md": "blog/posts/dstack-metrics.md" + "blog/inactive-dev-environments-auto-shutdown.md": "blog/posts/inactivity-duration.md" + "blog/data-centers-and-private-clouds.md": "blog/posts/gpu-blocks-and-proxy-jump.md" + "blog/distributed-training-with-aws-efa.md": "examples/clusters/aws/index.md" + "blog/dstack-stats.md": "blog/posts/dstack-metrics.md" + "docs/concepts/metrics.md": "docs/guides/metrics.md" + "docs/guides/monitoring.md": "docs/guides/metrics.md" + "blog/nvidia-and-amd-on-vultr.md.md": "blog/posts/nvidia-and-amd-on-vultr.md" + "examples/misc/nccl-tests/index.md": "examples/clusters/nccl-rccl-tests/index.md" + "examples/misc/a3high-clusters/index.md": "examples/clusters/gcp/index.md" + "examples/misc/a3mega-clusters/index.md": "examples/clusters/gcp/index.md" + "examples/distributed-training/nccl-tests/index.md": "examples/clusters/nccl-rccl-tests/index.md" + "examples/distributed-training/rccl-tests/index.md": "examples/clusters/nccl-rccl-tests/index.md" + "examples/deployment/nim/index.md": "examples/inference/nim/index.md" + "examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" + "examples/deployment/tgi/index.md": "examples/inference/tgi/index.md" + "examples/deployment/sglang/index.md": "examples/inference/sglang/index.md" + "examples/deployment/trtllm/index.md": "examples/inference/trtllm/index.md" + "examples/fine-tuning/trl/index.md": "examples/single-node-training/trl/index.md" + "examples/fine-tuning/axolotl/index.md": "examples/single-node-training/axolotl/index.md" + "blog/efa.md": "examples/clusters/aws/index.md" + "docs/concepts/repos.md": "docs/concepts/dev-environments.md#repos" + "examples/clusters/a3high/index.md": "examples/clusters/gcp/index.md" + "examples/clusters/a3mega/index.md": "examples/clusters/gcp/index.md" + 
"examples/clusters/a4/index.md": "examples/clusters/gcp/index.md" + "examples/clusters/efa/index.md": "examples/clusters/aws/index.md" - typeset - gen-files: - scripts: # always relative to mkdocs.yml + scripts: # always relative to mkdocs.yml - scripts/docs/gen_examples.py - scripts/docs/gen_cli_reference.py - scripts/docs/gen_openapi_reference.py @@ -279,70 +279,71 @@ nav: - Protips: docs/guides/protips.md - Migration: docs/guides/migration.md - Reference: - - .dstack.yml: - - dev-environment: docs/reference/dstack.yml/dev-environment.md - - task: docs/reference/dstack.yml/task.md - - service: docs/reference/dstack.yml/service.md - - fleet: docs/reference/dstack.yml/fleet.md - - gateway: docs/reference/dstack.yml/gateway.md - - volume: docs/reference/dstack.yml/volume.md - - server/config.yml: docs/reference/server/config.yml.md - - CLI: - - dstack server: docs/reference/cli/dstack/server.md - - dstack init: docs/reference/cli/dstack/init.md - - dstack apply: docs/reference/cli/dstack/apply.md - - dstack delete: docs/reference/cli/dstack/delete.md - - dstack ps: docs/reference/cli/dstack/ps.md - - dstack stop: docs/reference/cli/dstack/stop.md - - dstack attach: docs/reference/cli/dstack/attach.md - - dstack logs: docs/reference/cli/dstack/logs.md - - dstack metrics: docs/reference/cli/dstack/metrics.md - - dstack event: docs/reference/cli/dstack/event.md - - dstack project: docs/reference/cli/dstack/project.md - - dstack fleet: docs/reference/cli/dstack/fleet.md - - dstack offer: docs/reference/cli/dstack/offer.md - - dstack volume: docs/reference/cli/dstack/volume.md - - dstack gateway: docs/reference/cli/dstack/gateway.md - - dstack secret: docs/reference/cli/dstack/secret.md - - API: - - Python API: docs/reference/api/python/index.md - - REST API: docs/reference/api/rest/index.md - - Environment variables: docs/reference/environment-variables.md - - .dstack/profiles.yml: docs/reference/profiles.yml.md - - Plugins: - - Python API: 
docs/reference/plugins/python/index.md - - REST API: docs/reference/plugins/rest/index.md - - llms-full.txt: https://dstack.ai/llms-full.txt + - .dstack.yml: + - dev-environment: docs/reference/dstack.yml/dev-environment.md + - task: docs/reference/dstack.yml/task.md + - service: docs/reference/dstack.yml/service.md + - fleet: docs/reference/dstack.yml/fleet.md + - gateway: docs/reference/dstack.yml/gateway.md + - volume: docs/reference/dstack.yml/volume.md + - server/config.yml: docs/reference/server/config.yml.md + - CLI: + - dstack server: docs/reference/cli/dstack/server.md + - dstack init: docs/reference/cli/dstack/init.md + - dstack apply: docs/reference/cli/dstack/apply.md + - dstack delete: docs/reference/cli/dstack/delete.md + - dstack ps: docs/reference/cli/dstack/ps.md + - dstack stop: docs/reference/cli/dstack/stop.md + - dstack attach: docs/reference/cli/dstack/attach.md + - dstack login: docs/reference/cli/dstack/login.md + - dstack logs: docs/reference/cli/dstack/logs.md + - dstack metrics: docs/reference/cli/dstack/metrics.md + - dstack event: docs/reference/cli/dstack/event.md + - dstack project: docs/reference/cli/dstack/project.md + - dstack fleet: docs/reference/cli/dstack/fleet.md + - dstack offer: docs/reference/cli/dstack/offer.md + - dstack volume: docs/reference/cli/dstack/volume.md + - dstack gateway: docs/reference/cli/dstack/gateway.md + - dstack secret: docs/reference/cli/dstack/secret.md + - API: + - Python API: docs/reference/api/python/index.md + - REST API: docs/reference/api/rest/index.md + - Environment variables: docs/reference/environment-variables.md + - .dstack/profiles.yml: docs/reference/profiles.yml.md + - Plugins: + - Python API: docs/reference/plugins/python/index.md + - REST API: docs/reference/plugins/rest/index.md + - llms-full.txt: https://dstack.ai/llms-full.txt - Examples: - - examples.md - - Single-node training: - - TRL: examples/single-node-training/trl/index.md - - Axolotl: 
examples/single-node-training/axolotl/index.md - - Distributed training: - - TRL: examples/distributed-training/trl/index.md - - Axolotl: examples/distributed-training/axolotl/index.md - - Ray+RAGEN: examples/distributed-training/ray-ragen/index.md - - Clusters: - - AWS: examples/clusters/aws/index.md - - GCP: examples/clusters/gcp/index.md - - Lambda: examples/clusters/lambda/index.md - - Crusoe: examples/clusters/crusoe/index.md - - NCCL/RCCL tests: examples/clusters/nccl-rccl-tests/index.md - - Inference: - - SGLang: examples/inference/sglang/index.md - - vLLM: examples/inference/vllm/index.md - - TGI: examples/inference/tgi/index.md - - NIM: examples/inference/nim/index.md - - TensorRT-LLM: examples/inference/trtllm/index.md - - Accelerators: - - AMD: examples/accelerators/amd/index.md - - TPU: examples/accelerators/tpu/index.md - - Intel Gaudi: examples/accelerators/intel/index.md - - Tenstorrent: examples/accelerators/tenstorrent/index.md - - Models: - - Wan2.2: examples/models/wan22/index.md - - Blog: - - blog/index.md + - examples.md + - Single-node training: + - TRL: examples/single-node-training/trl/index.md + - Axolotl: examples/single-node-training/axolotl/index.md + - Distributed training: + - TRL: examples/distributed-training/trl/index.md + - Axolotl: examples/distributed-training/axolotl/index.md + - Ray+RAGEN: examples/distributed-training/ray-ragen/index.md + - Clusters: + - AWS: examples/clusters/aws/index.md + - GCP: examples/clusters/gcp/index.md + - Lambda: examples/clusters/lambda/index.md + - Crusoe: examples/clusters/crusoe/index.md + - NCCL/RCCL tests: examples/clusters/nccl-rccl-tests/index.md + - Inference: + - SGLang: examples/inference/sglang/index.md + - vLLM: examples/inference/vllm/index.md + - TGI: examples/inference/tgi/index.md + - NIM: examples/inference/nim/index.md + - TensorRT-LLM: examples/inference/trtllm/index.md + - Accelerators: + - AMD: examples/accelerators/amd/index.md + - TPU: examples/accelerators/tpu/index.md + - 
Intel Gaudi: examples/accelerators/intel/index.md + - Tenstorrent: examples/accelerators/tenstorrent/index.md + - Models: + - Wan2.2: examples/models/wan22/index.md + - Blog: + - blog/index.md - Case studies: blog/case-studies.md - Benchmarks: blog/benchmarks.md # - Discord: https://discord.gg/u8SmfwPpMd" target="_blank diff --git a/pyproject.toml b/pyproject.toml index e69ec4d5aa..e540705d93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,11 +100,12 @@ ignore = [ dev = [ "httpx>=0.28.1", "pre-commit>=4.2.0", + "pytest~=7.2", "pytest-asyncio>=0.23.8", "pytest-httpbin>=2.1.0", - "httpbin>=0.10.2", # indirect to make compatible with Werkzeug 3 - "pytest~=7.2", "pytest-socket>=0.7.0", + "pytest-env>=1.1.0", + "httpbin>=0.10.2", # indirect to make compatible with Werkzeug 3 "requests-mock>=1.12.1", "openai>=1.68.2", "freezegun>=1.5.1", diff --git a/pytest.ini b/pytest.ini index 899f67a61b..30c0e62811 100644 --- a/pytest.ini +++ b/pytest.ini @@ -8,3 +8,5 @@ addopts = markers = shim_version dockerized +env = + DSTACK_CLI_RICH_FORCE_TERMINAL=0 diff --git a/src/dstack/_internal/cli/commands/login.py b/src/dstack/_internal/cli/commands/login.py new file mode 100644 index 0000000000..54fdc0a0b6 --- /dev/null +++ b/src/dstack/_internal/cli/commands/login.py @@ -0,0 +1,237 @@ +import argparse +import queue +import threading +import urllib.parse +import webbrowser +from http.server import BaseHTTPRequestHandler, HTTPServer +from typing import Optional + +from dstack._internal.cli.commands import BaseCommand +from dstack._internal.cli.utils.common import console +from dstack._internal.core.errors import ClientError, CLIError +from dstack._internal.core.models.users import UserWithCreds +from dstack.api._public.runs import ConfigManager +from dstack.api.server import APIClient + + +class LoginCommand(BaseCommand): + NAME = "login" + DESCRIPTION = "Authorize the CLI using Single Sign-On" + + def _register(self): + super()._register() + self._parser.add_argument( + 
"--url", + help="The server URL, e.g. https://sky.dstack.ai", + required=True, + ) + self._parser.add_argument( + "-p", + "--provider", + help=( + "The SSO provider name." + " Selected automatically if the server supports only one provider." + ), + ) + + def _command(self, args: argparse.Namespace): + super()._command(args) + base_url = _normalize_url_or_error(args.url) + api_client = APIClient(base_url=base_url) + provider = self._select_provider_or_error(api_client=api_client, provider=args.provider) + server = _LoginServer(api_client=api_client, provider=provider) + try: + server.start() + auth_resp = api_client.auth.authorize(provider=provider, local_port=server.port) + opened = webbrowser.open(auth_resp.authorization_url) + if opened: + console.print( + f"Your browser has been opened to log in with [code]{provider.title()}[/]:\n" + ) + else: + console.print(f"Open the URL to log in with [code]{provider.title()}[/]:\n") + print(f"{auth_resp.authorization_url}\n") + user = server.get_logged_in_user() + finally: + server.shutdown() + if user is None: + raise CLIError("CLI authentication failed") + console.print(f"Logged in as [code]{user.username}[/].") + api_client = APIClient(base_url=base_url, token=user.creds.token) + self._configure_projects(api_client=api_client, user=user) + + def _select_provider_or_error(self, api_client: APIClient, provider: Optional[str]) -> str: + providers = api_client.auth.list_providers() + available_providers = [p.name for p in providers if p.enabled] + if len(available_providers) == 0: + raise CLIError("No SSO providers configured on the server.") + if provider is None: + if len(available_providers) > 1: + raise CLIError( + "Specify -p/--provider to choose SSO provider" + f" Available providers: {', '.join(available_providers)}" + ) + return available_providers[0] + if provider not in available_providers: + raise CLIError( + f"Provider {provider} not configured on the server." 
+ f" Available providers: {', '.join(available_providers)}" + ) + return provider + + def _configure_projects(self, api_client: APIClient, user: UserWithCreds): + projects = api_client.projects.list(include_not_joined=False) + if len(projects) == 0: + console.print( + "No projects configured." + " Create your own project via the UI or contact a project manager to add you to the project." + ) + return + config_manager = ConfigManager() + default_project = config_manager.get_project_config() + new_default_project = None + for i, project in enumerate(projects): + set_as_default = ( + default_project is None + and i == 0 + or default_project is not None + and default_project.name == project.project_name + ) + if set_as_default: + new_default_project = project + config_manager.configure_project( + name=project.project_name, + url=api_client.base_url, + token=user.creds.token, + default=set_as_default, + ) + config_manager.save() + console.print( + f"Configured projects: {', '.join(f'[code]{p.project_name}[/]' for p in projects)}." + ) + if new_default_project: + console.print( + f"Set project [code]{new_default_project.project_name}[/] as default project." + ) + + +class _BadRequestError(Exception): + pass + + +class _LoginServer: + def __init__(self, api_client: APIClient, provider: str): + self._api_client = api_client + self._provider = provider + self._result_queue: queue.Queue[Optional[UserWithCreds]] = queue.Queue() + # Using built-in HTTP server to avoid extra deps. 
+ callback_handler = self._make_callback_handler( + result_queue=self._result_queue, + api_client=api_client, + provider=provider, + ) + self._server = self._create_server(handler=callback_handler) + + def start(self): + self._thread = threading.Thread(target=self._server.serve_forever) + self._thread.start() + + def shutdown(self): + self._server.shutdown() + + def get_logged_in_user(self) -> Optional[UserWithCreds]: + return self._result_queue.get() + + @property + def port(self) -> int: + return self._server.server_port + + def _make_callback_handler( + self, + result_queue: queue.Queue[Optional[UserWithCreds]], + api_client: APIClient, + provider: str, + ) -> type[BaseHTTPRequestHandler]: + class _CallbackHandler(BaseHTTPRequestHandler): + def do_GET(self): + parsed_path = urllib.parse.urlparse(self.path) + if parsed_path.path != "/auth/callback": + self.send_response(404) + self.end_headers() + return + try: + self._handle_auth_callback(parsed_path) + except _BadRequestError as e: + self.send_error(400, e.args[0]) + result_queue.put(None) + + def log_message(self, format: str, *args): + # Do not log server requests. 
+ pass + + def _handle_auth_callback(self, parsed_path: urllib.parse.ParseResult): + try: + params = urllib.parse.parse_qs(parsed_path.query, strict_parsing=True) + except ValueError: + raise _BadRequestError("Bad query params") + code = params.get("code", [None])[0] + state = params.get("state", [None])[0] + if code is None or state is None: + raise _BadRequestError("Missing required params") + try: + user = api_client.auth.callback(provider=provider, code=code, state=state) + except ClientError: + raise _BadRequestError("Authentication failed") + self._send_success_html() + result_queue.put(user) + + def _send_success_html(self): + body = _SUCCESS_HTML.encode() + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + return _CallbackHandler + + def _create_server(self, handler: type[BaseHTTPRequestHandler]) -> HTTPServer: + server_address = ("127.0.0.1", 0) + server = HTTPServer(server_address, handler) + return server + + +def _normalize_url_or_error(url: str) -> str: + if not url.startswith("http://") and not url.startswith("https://"): + url = "http://" + url + parsed = urllib.parse.urlparse(url) + if ( + not parsed.scheme + or not parsed.hostname + or parsed.path not in ("", "/") + or parsed.params + or parsed.query + or parsed.fragment + or (parsed.port is not None and not (1 <= parsed.port <= 65535)) + ): + raise CLIError("Invalid server URL format. Format: --url https://sky.dstack.ai") + return url + + +_SUCCESS_HTML = """\ + + + + + CLI authenticated + + + +

dstack CLI authenticated

+

You may close this page.

+ + +""" diff --git a/src/dstack/_internal/cli/main.py b/src/dstack/_internal/cli/main.py index 98be45b8d5..61f3967ab7 100644 --- a/src/dstack/_internal/cli/main.py +++ b/src/dstack/_internal/cli/main.py @@ -12,6 +12,7 @@ from dstack._internal.cli.commands.fleet import FleetCommand from dstack._internal.cli.commands.gateway import GatewayCommand from dstack._internal.cli.commands.init import InitCommand +from dstack._internal.cli.commands.login import LoginCommand from dstack._internal.cli.commands.logs import LogsCommand from dstack._internal.cli.commands.metrics import MetricsCommand from dstack._internal.cli.commands.offer import OfferCommand @@ -68,6 +69,7 @@ def main(): GatewayCommand.register(subparsers) InitCommand.register(subparsers) OfferCommand.register(subparsers) + LoginCommand.register(subparsers) LogsCommand.register(subparsers) MetricsCommand.register(subparsers) ProjectCommand.register(subparsers) diff --git a/src/dstack/_internal/cli/utils/common.py b/src/dstack/_internal/cli/utils/common.py index c75f08b81b..87f0687e1b 100644 --- a/src/dstack/_internal/cli/utils/common.py +++ b/src/dstack/_internal/cli/utils/common.py @@ -21,7 +21,10 @@ "code": "bold sea_green3", } -console = Console(theme=Theme(_colors)) +console = Console( + theme=Theme(_colors), + force_terminal=settings.CLI_RICH_FORCE_TERMINAL, +) LIVE_TABLE_REFRESH_RATE_PER_SEC = 1 diff --git a/src/dstack/_internal/core/models/auth.py b/src/dstack/_internal/core/models/auth.py new file mode 100644 index 0000000000..f6d09fbc73 --- /dev/null +++ b/src/dstack/_internal/core/models/auth.py @@ -0,0 +1,28 @@ +from typing import Annotated, Optional + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class OAuthProviderInfo(CoreModel): + name: Annotated[str, Field(description="The OAuth2 provider name.")] + enabled: Annotated[ + bool, Field(description="Whether the provider is configured on the server.") + ] + + +class OAuthState(CoreModel): + """ + A 
struct that the server puts in the OAuth2 state parameter. + """ + + value: Annotated[str, Field(description="A random string to protect against CSRF.")] + local_port: Annotated[ + Optional[int], + Field( + description="If specified, the user is redirected to localhost:local_port after the redirect from the provider.", + ge=1, + le=65535, + ), + ] = None diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index 9c83bac793..527dd128fe 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -25,6 +25,7 @@ from dstack._internal.server.background.tasks.process_probes import PROBES_SCHEDULER from dstack._internal.server.db import get_db, get_session_ctx, migrate from dstack._internal.server.routers import ( + auth, backends, events, files, @@ -210,6 +211,7 @@ def add_no_api_version_check_routes(paths: List[str]): def register_routes(app: FastAPI, ui: bool = True): app.include_router(server.router) app.include_router(users.router) + app.include_router(auth.router) app.include_router(projects.router) app.include_router(backends.root_router) app.include_router(backends.project_router) diff --git a/src/dstack/_internal/server/routers/auth.py b/src/dstack/_internal/server/routers/auth.py new file mode 100644 index 0000000000..89fe2f57f5 --- /dev/null +++ b/src/dstack/_internal/server/routers/auth.py @@ -0,0 +1,34 @@ +from fastapi import APIRouter + +from dstack._internal.core.models.auth import OAuthProviderInfo +from dstack._internal.server.schemas.auth import ( + OAuthGetNextRedirectRequest, + OAuthGetNextRedirectResponse, +) +from dstack._internal.server.services import auth as auth_services +from dstack._internal.server.utils.routers import CustomORJSONResponse + +router = APIRouter(prefix="/api/auth", tags=["auth"]) + + +@router.post("/list_providers", response_model=list[OAuthProviderInfo]) +async def list_providers(): + """ + Returns OAuth2 providers registered on the server. 
+ """ + return CustomORJSONResponse(auth_services.list_providers()) + + +@router.post("/get_next_redirect", response_model=OAuthGetNextRedirectResponse) +async def get_next_redirect(body: OAuthGetNextRedirectRequest): + """ + A helper endpoint that returns the next redirect URL in case the state encodes it. + Can be used by the UI after the redirect from the provider + to determine if the user needs to be redirected further (CLI login) + or the auth callback endpoint needs to be called directly (UI login). + """ + return CustomORJSONResponse( + OAuthGetNextRedirectResponse( + redirect_url=auth_services.get_next_redirect_url(code=body.code, state=body.state) + ) + ) diff --git a/src/dstack/_internal/server/routers/projects.py b/src/dstack/_internal/server/routers/projects.py index 56d41b6ca0..d35b9535e8 100644 --- a/src/dstack/_internal/server/routers/projects.py +++ b/src/dstack/_internal/server/routers/projects.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List, Optional, Tuple from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession @@ -10,6 +10,7 @@ AddProjectMemberRequest, CreateProjectRequest, DeleteProjectsRequest, + ListProjectsRequest, RemoveProjectMemberRequest, SetProjectMembersRequest, UpdateProjectRequest, @@ -37,6 +38,7 @@ @router.post("/list", response_model=List[Project]) async def list_projects( + body: Optional[ListProjectsRequest] = None, session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), ): @@ -45,8 +47,13 @@ async def list_projects( `members` and `backends` are always empty - call `/api/projects/{project_name}/get` to retrieve them. 
""" + if body is None: + # For backward compatibility + body = ListProjectsRequest() return CustomORJSONResponse( - await projects.list_user_accessible_projects(session=session, user=user) + await projects.list_user_accessible_projects( + session=session, user=user, include_not_joined=body.include_not_joined + ) ) diff --git a/src/dstack/_internal/server/schemas/auth.py b/src/dstack/_internal/server/schemas/auth.py new file mode 100644 index 0000000000..942f1fb388 --- /dev/null +++ b/src/dstack/_internal/server/schemas/auth.py @@ -0,0 +1,83 @@ +from typing import Annotated, Optional + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class OAuthInfoResponse(CoreModel): + enabled: Annotated[ + bool, Field(description="Whether the OAuth2 provider is configured on the server.") + ] + + +class OAuthAuthorizeRequest(CoreModel): + local_port: Annotated[ + Optional[int], + Field( + description="If specified, the user is redirected to localhost:local_port after the redirect from the provider.", + ge=1, + le=65535, + ), + ] = None + base_url: Annotated[ + Optional[str], + Field( + description=( + "The server base URL used to access the dstack server, e.g. `http://localhost:3000`." + " Used to build redirect URLs when the dstack server is available on multiple domains." + ) + ), + ] = None + + +class OAuthAuthorizeResponse(CoreModel): + authorization_url: Annotated[str, Field(description="An OAuth2 authorization URL.")] + + +class OAuthCallbackRequest(CoreModel): + code: Annotated[ + str, + Field( + description="The OAuth2 authorization code received from the provider in the redirect URL." + ), + ] + state: Annotated[ + str, + Field(description="The state parameter received from the provider in the redirect URL."), + ] + base_url: Annotated[ + Optional[str], + Field( + description=( + "The server base URL used to access the dstack server, e.g. `http://localhost:3000`." 
+ " Used to build redirect URLs when the dstack server is available on multiple domains." + " It must match the base URL specified when generating the authorization URL." + ) + ), + ] = None + + +class OAuthGetNextRedirectRequest(CoreModel): + code: Annotated[ + str, + Field( + description="The OAuth2 authorization code received from the provider in the redirect URL." + ), + ] + state: Annotated[ + str, + Field(description="The state parameter received from the provider in the redirect URL."), + ] + + +class OAuthGetNextRedirectResponse(CoreModel): + redirect_url: Annotated[ + Optional[str], + Field( + description=( + "The URL that the user needs to be redirected to." + " If `null`, there is no next redirect." + ) + ), + ] diff --git a/src/dstack/_internal/server/schemas/projects.py b/src/dstack/_internal/server/schemas/projects.py index 355bb3a770..ec05c1fb47 100644 --- a/src/dstack/_internal/server/schemas/projects.py +++ b/src/dstack/_internal/server/schemas/projects.py @@ -6,6 +6,12 @@ from dstack._internal.core.models.users import ProjectRole +class ListProjectsRequest(CoreModel): + include_not_joined: Annotated[ + bool, Field(description="Include public projects where user is not a member") + ] = True + + class CreateProjectRequest(CoreModel): project_name: str is_public: bool = False diff --git a/src/dstack/_internal/server/services/auth.py b/src/dstack/_internal/server/services/auth.py new file mode 100644 index 0000000000..8ea40994f3 --- /dev/null +++ b/src/dstack/_internal/server/services/auth.py @@ -0,0 +1,77 @@ +import secrets +import urllib.parse +from base64 import b64decode, b64encode +from typing import Optional + +from fastapi import Request, Response + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.auth import OAuthProviderInfo, OAuthState +from dstack._internal.server import settings +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +_OAUTH_STATE_COOKIE_KEY 
= "oauth-state" + +_OAUTH_PROVIDERS: list[OAuthProviderInfo] = [] + + +def register_provider(provider_info: OAuthProviderInfo): + """ + Registers an OAuth2 provider supported on the server. + If the provider is supported but not configured, it should be registered with `enabled=False`. + The provider must register endpoints `/api/auth/{provider}/authorize` and `/api/auth/{provider}/callback` + as defined by the client (see `dstack.api.server._auth.AuthAPIClient`). + """ + _OAUTH_PROVIDERS.append(provider_info) + + +def list_providers() -> list[OAuthProviderInfo]: + return _OAUTH_PROVIDERS + + +def generate_oauth_state(local_port: Optional[int] = None) -> str: + value = str(secrets.token_hex(16)) + state = OAuthState(value=value, local_port=local_port) + return b64encode(state.json().encode()).decode() + + +def set_state_cookie(response: Response, state: str): + response.set_cookie( + key=_OAUTH_STATE_COOKIE_KEY, + value=state, + secure=settings.SERVER_URL.startswith("https://"), + samesite="strict", + httponly=True, + ) + + +def get_validated_state(request: Request, state: str) -> OAuthState: + state_cookie = request.cookies.get(_OAUTH_STATE_COOKIE_KEY) + if state != state_cookie: + raise ServerClientError("Invalid state token") + decoded_state = _decode_state(state) + if decoded_state is None: + raise ServerClientError("Invalid state token") + return decoded_state + + +def get_next_redirect_url(code: str, state: str) -> Optional[str]: + decoded_state = _decode_state(state) + if decoded_state is None: + raise ServerClientError("Invalid state token") + if decoded_state.local_port is None: + return None + params = {"code": code, "state": state} + redirect_url = f"http://localhost:{decoded_state.local_port}/auth/callback?{urllib.parse.urlencode(params)}" + return redirect_url + + +def _decode_state(state: str) -> Optional[OAuthState]: + try: + return OAuthState.parse_raw(b64decode(state, validate=True).decode()) + except Exception as e: + logger.debug("Exception when 
decoding OAuth2 state parameter: %s", repr(e)) + return None diff --git a/src/dstack/_internal/server/services/projects.py b/src/dstack/_internal/server/services/projects.py index 5e4842df56..3ef6c32785 100644 --- a/src/dstack/_internal/server/services/projects.py +++ b/src/dstack/_internal/server/services/projects.py @@ -83,18 +83,22 @@ async def list_user_projects( async def list_user_accessible_projects( session: AsyncSession, user: UserModel, + include_not_joined: bool, ) -> List[Project]: """ Returns all projects accessible to the user: - Projects where user is a member (public or private) - - Public projects where user is NOT a member + - if `include_not_joined`: Public projects where user is NOT a member """ if user.global_role == GlobalRole.ADMIN: projects = await list_project_models(session=session) else: - member_projects = await list_member_project_models(session=session, user=user) - public_projects = await list_public_non_member_project_models(session=session, user=user) - projects = member_projects + public_projects + projects = await list_member_project_models(session=session, user=user) + if include_not_joined: + public_projects = await list_public_non_member_project_models( + session=session, user=user + ) + projects += public_projects projects = sorted(projects, key=lambda p: p.created_at) return [ diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py index 81682480a2..6089e37c07 100644 --- a/src/dstack/_internal/settings.py +++ b/src/dstack/_internal/settings.py @@ -1,6 +1,7 @@ import os from dstack import version +from dstack._internal.utils.env import environ from dstack._internal.utils.version import parse_version DSTACK_VERSION = os.getenv("DSTACK_VERSION", version.__version__) @@ -28,6 +29,8 @@ CLI_LOG_LEVEL = os.getenv("DSTACK_CLI_LOG_LEVEL", "INFO").upper() CLI_FILE_LOG_LEVEL = os.getenv("DSTACK_CLI_FILE_LOG_LEVEL", "DEBUG").upper() +# Can be used to disable control characters (e.g. for testing). 
+CLI_RICH_FORCE_TERMINAL = environ.get_bool("DSTACK_CLI_RICH_FORCE_TERMINAL") # Development settings diff --git a/src/dstack/api/server/__init__.py b/src/dstack/api/server/__init__.py index 2ad94f0864..5d6ea08604 100644 --- a/src/dstack/api/server/__init__.py +++ b/src/dstack/api/server/__init__.py @@ -14,6 +14,7 @@ URLNotFoundError, ) from dstack._internal.utils.logging import get_logger +from dstack.api.server._auth import AuthAPIClient from dstack.api.server._backends import BackendsAPIClient from dstack.api.server._events import EventsAPIClient from dstack.api.server._files import FilesAPIClient @@ -52,16 +53,18 @@ class APIClient: files: operations with files """ - def __init__(self, base_url: str, token: str): + def __init__(self, base_url: str, token: Optional[str] = None): """ Args: base_url: The API endpoints prefix, e.g. `http://127.0.0.1:3000/`. token: The API token. """ self._base_url = base_url.rstrip("/") - self._token = token self._s = requests.session() - self._s.headers.update({"Authorization": f"Bearer {token}"}) + self._token = None + if token is not None: + self._token = token + self._s.headers.update({"Authorization": f"Bearer {token}"}) client_api_version = os.getenv("DSTACK_CLIENT_API_VERSION", version.__version__) if client_api_version is not None: self._s.headers.update({"X-API-VERSION": client_api_version}) @@ -71,6 +74,10 @@ def __init__(self, base_url: str, token: str): def base_url(self) -> str: return self._base_url + @property + def auth(self) -> AuthAPIClient: + return AuthAPIClient(self._request, self._logger) + @property def users(self) -> UsersAPIClient: return UsersAPIClient(self._request, self._logger) @@ -128,6 +135,8 @@ def events(self) -> EventsAPIClient: return EventsAPIClient(self._request, self._logger) def get_token_hash(self) -> str: + if self._token is None: + raise ValueError("Token not set") return hashlib.sha1(self._token.encode()).hexdigest()[:8] def _request( diff --git a/src/dstack/api/server/_auth.py 
b/src/dstack/api/server/_auth.py new file mode 100644 index 0000000000..b944a292a2 --- /dev/null +++ b/src/dstack/api/server/_auth.py @@ -0,0 +1,30 @@ +from typing import Optional + +from pydantic import parse_obj_as + +from dstack._internal.core.models.auth import OAuthProviderInfo +from dstack._internal.core.models.users import UserWithCreds +from dstack._internal.server.schemas.auth import ( + OAuthAuthorizeRequest, + OAuthAuthorizeResponse, + OAuthCallbackRequest, +) +from dstack.api.server._group import APIClientGroup + + +class AuthAPIClient(APIClientGroup): + def list_providers(self) -> list[OAuthProviderInfo]: + resp = self._request("/api/auth/list_providers") + return parse_obj_as(list[OAuthProviderInfo.__response__], resp.json()) + + def authorize(self, provider: str, local_port: Optional[int] = None) -> OAuthAuthorizeResponse: + body = OAuthAuthorizeRequest(local_port=local_port) + resp = self._request(f"/api/auth/{provider}/authorize", body=body.json()) + return parse_obj_as(OAuthAuthorizeResponse.__response__, resp.json()) + + def callback( + self, provider: str, code: str, state: str, base_url: Optional[str] = None + ) -> UserWithCreds: + body = OAuthCallbackRequest(code=code, state=state, base_url=base_url) + resp = self._request(f"/api/auth/{provider}/callback", body=body.json()) + return parse_obj_as(UserWithCreds.__response__, resp.json()) diff --git a/src/dstack/api/server/_projects.py b/src/dstack/api/server/_projects.py index 0fb47c9ab5..31bdc3b2de 100644 --- a/src/dstack/api/server/_projects.py +++ b/src/dstack/api/server/_projects.py @@ -8,6 +8,7 @@ AddProjectMemberRequest, CreateProjectRequest, DeleteProjectsRequest, + ListProjectsRequest, MemberSetting, RemoveProjectMemberRequest, SetProjectMembersRequest, @@ -16,8 +17,9 @@ class ProjectsAPIClient(APIClientGroup): - def list(self) -> List[Project]: - resp = self._request("/api/projects/list") + def list(self, include_not_joined: bool = True) -> List[Project]: + body = 
ListProjectsRequest(include_not_joined=include_not_joined) + resp = self._request("/api/projects/list", body=body.json()) return parse_obj_as(List[Project.__response__], resp.json()) def create(self, project_name: str, is_public: bool = False) -> Project: diff --git a/src/tests/_internal/cli/commands/test_login.py b/src/tests/_internal/cli/commands/test_login.py new file mode 100644 index 0000000000..42b46c2b73 --- /dev/null +++ b/src/tests/_internal/cli/commands/test_login.py @@ -0,0 +1,103 @@ +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import call, patch + +from pytest import CaptureFixture + +from tests._internal.cli.common import run_dstack_cli + + +class TestLogin: + def test_login_no_projects(self, capsys: CaptureFixture, tmp_path: Path): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as APIClientMock, + patch("dstack._internal.cli.commands.login._LoginServer") as LoginServerMock, + ): + webbrowser_mock.open.return_value = True + APIClientMock.return_value.auth.list_providers.return_value = [ + SimpleNamespace(name="github", enabled=True) + ] + APIClientMock.return_value.auth.authorize.return_value = SimpleNamespace( + authorization_url="http://auth_url" + ) + APIClientMock.return_value.projects.list.return_value = [] + user = SimpleNamespace(username="me", creds=SimpleNamespace(token="token")) + LoginServerMock.return_value.get_logged_in_user.return_value = user + exit_code = run_dstack_cli( + [ + "login", + "--url", + "http://127.0.0.1:31313", + "--provider", + "github", + ], + home_dir=tmp_path, + ) + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me." + "No projects configured. Create your own project via the UI or contact a project manager to add you to the project." 
+ ) + + def test_login_configures_projects(self, capsys: CaptureFixture, tmp_path: Path): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as APIClientMock, + patch("dstack._internal.cli.commands.login.ConfigManager") as ConfigManagerMock, + patch("dstack._internal.cli.commands.login._LoginServer") as LoginServerMock, + ): + webbrowser_mock.open.return_value = True + APIClientMock.return_value.auth.list_providers.return_value = [ + SimpleNamespace(name="github", enabled=True) + ] + APIClientMock.return_value.auth.authorize.return_value = SimpleNamespace( + authorization_url="http://auth_url" + ) + APIClientMock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + SimpleNamespace(project_name="project2"), + ] + APIClientMock.return_value.base_url = "http://127.0.0.1:31313" + ConfigManagerMock.return_value.get_project_config.return_value = None + user = SimpleNamespace(username="me", creds=SimpleNamespace(token="token")) + LoginServerMock.return_value.get_logged_in_user.return_value = user + exit_code = run_dstack_cli( + [ + "login", + "--url", + "http://127.0.0.1:31313", + "--provider", + "github", + ], + home_dir=tmp_path, + ) + ConfigManagerMock.return_value.configure_project.assert_has_calls( + [ + call( + name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=True, + ), + call( + name="project2", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + ] + ) + ConfigManagerMock.return_value.save.assert_called() + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me." + "Configured projects: project1, project2." + "Set project project1 as default project." 
+ ) diff --git a/src/tests/_internal/cli/common.py b/src/tests/_internal/cli/common.py index 8b4a370ea6..09f4541c7e 100644 --- a/src/tests/_internal/cli/common.py +++ b/src/tests/_internal/cli/common.py @@ -7,7 +7,7 @@ def run_dstack_cli( - args: List[str], + cli_args: List[str], home_dir: Optional[Path] = None, repo_dir: Optional[Path] = None, ) -> int: @@ -18,13 +18,14 @@ def run_dstack_cli( if home_dir is not None: prev_home_dir = os.environ["HOME"] os.environ["HOME"] = str(home_dir) - with patch("sys.argv", ["dstack"] + args): + with patch("sys.argv", ["dstack"] + cli_args): try: main() except SystemExit as e: exit_code = e.code - if home_dir is not None: - os.environ["HOME"] = prev_home_dir - if repo_dir is not None: - os.chdir(cwd) + finally: + if home_dir is not None: + os.environ["HOME"] = prev_home_dir + if repo_dir is not None: + os.chdir(cwd) return exit_code diff --git a/src/tests/_internal/server/routers/test_auth.py b/src/tests/_internal/server/routers/test_auth.py new file mode 100644 index 0000000000..f4c8bb0e59 --- /dev/null +++ b/src/tests/_internal/server/routers/test_auth.py @@ -0,0 +1,64 @@ +import json +from base64 import b64encode + +import pytest +from httpx import AsyncClient + +from dstack._internal.core.models.auth import OAuthProviderInfo +from dstack._internal.server.services.auth import register_provider + + +class TestListProviders: + @pytest.mark.asyncio + async def test_returns_no_providers(self, client: AsyncClient): + response = await client.post("/api/auth/list_providers") + assert response.status_code == 200 + assert response.json() == [] + + @pytest.mark.asyncio + async def test_returns_registered_providers(self, client: AsyncClient): + register_provider(OAuthProviderInfo(name="provider1", enabled=True)) + register_provider(OAuthProviderInfo(name="provider2", enabled=False)) + response = await client.post("/api/auth/list_providers") + assert response.status_code == 200 + assert response.json() == [ + { + "name": "provider1", + 
"enabled": True, + }, + { + "name": "provider2", + "enabled": False, + }, + ] + + +class TestGetNextRedirectURL: + @pytest.mark.asyncio + async def test_returns_no_redirect_url_if_local_port_not_set(self, client: AsyncClient): + state = b64encode(json.dumps({"value": "12356", "local_port": None}).encode()).decode() + response = await client.post( + "/api/auth/get_next_redirect", json={"code": "1234", "state": state} + ) + assert response.status_code == 200 + assert response.json() == {"redirect_url": None} + + @pytest.mark.asyncio + async def test_returns_redirect_url_if_local_port_set(self, client: AsyncClient): + state = b64encode(json.dumps({"value": "12356", "local_port": 12345}).encode()).decode() + response = await client.post( + "/api/auth/get_next_redirect", json={"code": "1234", "state": state} + ) + assert response.status_code == 200 + assert response.json() == { + "redirect_url": f"http://localhost:12345/auth/callback?code=1234&state={state}" + } + + @pytest.mark.asyncio + async def test_returns_400_if_state_invalid(self, client: AsyncClient): + state = "some_invalid_state" + response = await client.post( + "/api/auth/get_next_redirect", json={"code": "1234", "state": state} + ) + assert response.status_code == 400 + assert "Invalid state token" in response.json()["detail"][0]["msg"] From fc26a0d3e2579cf47df1fdd31d153edde48bdbe1 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Wed, 24 Dec 2025 11:29:24 +0000 Subject: [PATCH 019/187] Allow users to delete their only project (#3416) The restriction preventing users from deleting their only project was originally introduced because the UI could not function correctly without projects, which is no longer the case. 
--- src/dstack/_internal/server/services/projects.py | 2 -- src/tests/_internal/server/routers/test_projects.py | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/dstack/_internal/server/services/projects.py b/src/dstack/_internal/server/services/projects.py index 3ef6c32785..937247f5a1 100644 --- a/src/dstack/_internal/server/services/projects.py +++ b/src/dstack/_internal/server/services/projects.py @@ -203,8 +203,6 @@ async def delete_projects( for project in projects_to_delete: if not _is_project_admin(user=user, project=project): raise ForbiddenError() - if all(name in projects_names for name in user_project_names): - raise ServerClientError("Cannot delete the only project") res = await session.execute( select(ProjectModel) diff --git a/src/tests/_internal/server/routers/test_projects.py b/src/tests/_internal/server/routers/test_projects.py index 826ecbc096..4b62ac416d 100644 --- a/src/tests/_internal/server/routers/test_projects.py +++ b/src/tests/_internal/server/routers/test_projects.py @@ -453,7 +453,7 @@ async def test_returns_40x_if_not_authenticated(self, test_db, client: AsyncClie @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) - async def test_cannot_delete_the_only_project( + async def test_deletes_the_only_project( self, test_db, session: AsyncSession, client: AsyncClient ): user = await create_user(session=session, global_role=GlobalRole.USER) @@ -466,9 +466,9 @@ async def test_cannot_delete_the_only_project( headers=get_auth_headers(user.token), json={"projects_names": [project.name]}, ) - assert response.status_code == 400 + assert response.status_code == 200 await session.refresh(project) - assert not project.deleted + assert project.deleted @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) From fe16cb4899a581b9dfc0f633b2166a9776e8e4cd Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 
25 Dec 2025 08:52:43 +0000 Subject: [PATCH 020/187] Indicate deleted actors and projects in Events API (#3422) In `/api/events/list`: - Instead of `_deleted_*` placeholders, return original names of deleted projects and actors - Add the `is_project_deleted` and `is_actor_user_deleted` properties to indicate deleted projects and actors --- src/dstack/_internal/core/models/events.py | 18 ++++++++ .../_internal/server/services/events.py | 46 +++++++++++++------ .../_internal/server/routers/test_events.py | 38 +++++++++++++++ 3 files changed, 88 insertions(+), 14 deletions(-) diff --git a/src/dstack/_internal/core/models/events.py b/src/dstack/_internal/core/models/events.py index caf6d60e47..fc7f51601a 100644 --- a/src/dstack/_internal/core/models/events.py +++ b/src/dstack/_internal/core/models/events.py @@ -46,6 +46,15 @@ class EventTarget(CoreModel): ) ), ] + is_project_deleted: Annotated[ + Optional[bool], + Field( + description=( + "Whether the project the target entity belongs to is deleted," + " or `null` for target types not bound to a project (e.g., users)" + ) + ), + ] = None # default for client compatibility with pre-0.20.1 servers id: Annotated[uuid.UUID, Field(description="ID of the target entity")] name: Annotated[str, Field(description="Name of the target entity")] @@ -72,6 +81,15 @@ class Event(CoreModel): ) ), ] + is_actor_user_deleted: Annotated[ + Optional[bool], + Field( + description=( + "Whether the user who performed the action that triggered the event is deleted," + " or `null` if the action was performed by the system" + ) + ), + ] = None # default for client compatibility with pre-0.20.1 servers targets: Annotated[ list[EventTarget], Field(description="List of entities affected by the event") ] diff --git a/src/dstack/_internal/server/services/events.py b/src/dstack/_internal/server/services/events.py index 7a4d355237..c9818ef9ee 100644 --- a/src/dstack/_internal/server/services/events.py +++ b/src/dstack/_internal/server/services/events.py 
@@ -364,10 +364,12 @@ async def list_events( ( joinedload(EventModel.targets) .joinedload(EventTargetModel.entity_project) - .load_only(ProjectModel.name) + .load_only(ProjectModel.name, ProjectModel.original_name, ProjectModel.deleted) .noload(ProjectModel.owner) ), - joinedload(EventModel.actor_user).load_only(UserModel.name), + joinedload(EventModel.actor_user).load_only( + UserModel.name, UserModel.original_name, UserModel.deleted + ), ) ) if event_filters: @@ -386,23 +388,39 @@ async def list_events( return list(map(event_model_to_event, event_models)) -def event_model_to_event(event_model: EventModel) -> Event: - targets = [ - EventTarget( - type=target.entity_type.value, - project_id=target.entity_project_id, - project_name=target.entity_project.name if target.entity_project else None, - id=target.entity_id, - name=target.entity_name, - ) - for target in event_model.targets - ] +def event_target_model_to_event_target(model: EventTargetModel) -> EventTarget: + project_name = None + is_project_deleted = None + if model.entity_project is not None: + project_name = model.entity_project.name + is_project_deleted = model.entity_project.deleted + if is_project_deleted and model.entity_project.original_name is not None: + project_name = model.entity_project.original_name + return EventTarget( + type=model.entity_type.value, + project_id=model.entity_project_id, + project_name=project_name, + is_project_deleted=is_project_deleted, + id=model.entity_id, + name=model.entity_name, + ) + +def event_model_to_event(event_model: EventModel) -> Event: + actor_user_name = None + is_actor_user_deleted = None + if event_model.actor_user is not None: + actor_user_name = event_model.actor_user.name + is_actor_user_deleted = event_model.actor_user.deleted + if is_actor_user_deleted and event_model.actor_user.original_name is not None: + actor_user_name = event_model.actor_user.original_name + targets = list(map(event_target_model_to_event_target, event_model.targets)) return 
Event( id=event_model.id, message=event_model.message, recorded_at=event_model.recorded_at, actor_user_id=event_model.actor_user_id, - actor_user=event_model.actor_user.name if event_model.actor_user else None, + actor_user=actor_user_name, + is_actor_user_deleted=is_actor_user_deleted, targets=targets, ) diff --git a/src/tests/_internal/server/routers/test_events.py b/src/tests/_internal/server/routers/test_events.py index 478474bca7..f31c082d06 100644 --- a/src/tests/_internal/server/routers/test_events.py +++ b/src/tests/_internal/server/routers/test_events.py @@ -68,11 +68,13 @@ async def test_response_format(self, session: AsyncSession, client: AsyncClient) "recorded_at": "2026-01-01T12:00:01+00:00", "actor_user_id": None, "actor_user": None, + "is_actor_user_deleted": None, "targets": [ { "type": "project", "project_id": str(project.id), "project_name": "test_project", + "is_project_deleted": False, "id": str(project.id), "name": "test_project", }, @@ -84,11 +86,13 @@ async def test_response_format(self, session: AsyncSession, client: AsyncClient) "recorded_at": "2026-01-01T12:00:00+00:00", "actor_user_id": str(user.id), "actor_user": "test_user", + "is_actor_user_deleted": False, "targets": [ { "type": "project", "project_id": str(project.id), "project_name": "test_project", + "is_project_deleted": False, "id": str(project.id), "name": "test_project", }, @@ -96,6 +100,7 @@ async def test_response_format(self, session: AsyncSession, client: AsyncClient) "type": "user", "project_id": None, "project_name": None, + "is_project_deleted": None, "id": str(user.id), "name": "test_user", }, @@ -103,6 +108,39 @@ async def test_response_format(self, session: AsyncSession, client: AsyncClient) }, ] + async def test_deleted_actor_and_project( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session=session, name="test_user") + project = await create_project(session=session, owner=user, name="test_project") + events.emit( + 
session, + "Project deleted", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(project)], + ) + user.original_name = user.name + user.name = "_deleted_user_placeholder" + user.deleted = True + project.original_name = project.name + project.name = "_deleted_project_placeholder" + project.deleted = True + await session.commit() + other_user = await create_user(session=session, name="other_user") + + resp = await client.post( + "/api/events/list", headers=get_auth_headers(other_user.token), json={} + ) + resp.raise_for_status() + assert len(resp.json()) == 1 + assert resp.json()[0]["actor_user_id"] == str(user.id) + assert resp.json()[0]["actor_user"] == "test_user" + assert resp.json()[0]["is_actor_user_deleted"] == True + assert len(resp.json()[0]["targets"]) == 1 + assert resp.json()[0]["targets"][0]["project_id"] == str(project.id) + assert resp.json()[0]["targets"][0]["project_name"] == "test_project" + assert resp.json()[0]["targets"][0]["is_project_deleted"] == True + async def test_empty_response_when_no_events( self, session: AsyncSession, client: AsyncClient ) -> None: From 62156467fc1bc640699fca77ccf6c2450cb9f9d6 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Thu, 25 Dec 2025 10:06:26 +0100 Subject: [PATCH 021/187] [UX] Make "No fleets" run status more explicit #3405 (#3406) --- frontend/src/libs/run.ts | 4 ++ .../pages/Runs/Details/RunDetails/index.tsx | 6 ++- .../Runs/List/hooks/useColumnsDefinitions.tsx | 5 ++- .../cli/services/configurators/run.py | 7 +++- src/dstack/_internal/cli/utils/common.py | 6 +++ src/dstack/_internal/cli/utils/run.py | 20 +++++++-- .../tasks/process_submitted_jobs.py | 1 + .../server/services/jobs/__init__.py | 5 +++ src/tests/_internal/cli/utils/test_run.py | 41 +++++++++++++++---- 9 files changed, 79 insertions(+), 16 deletions(-) diff --git a/frontend/src/libs/run.ts b/frontend/src/libs/run.ts index e49e4c28fa..b1a626bf82 100644 --- 
a/frontend/src/libs/run.ts +++ b/frontend/src/libs/run.ts @@ -39,7 +39,11 @@ export const getStatusIconType = ( export const getStatusIconColor = ( status: IRun['status'] | TJobStatus, terminationReason: string | null | undefined, + statusMessage: string, ): StatusIndicatorProps.Color | undefined => { + if (statusMessage === 'No fleets') { + return 'red'; + } if (terminationReason === 'failed_to_start_due_to_no_capacity' || terminationReason === 'interrupted_by_no_capacity') { return 'yellow'; } diff --git a/frontend/src/pages/Runs/Details/RunDetails/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/index.tsx index c00b2ce9d2..24e5c2718d 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/index.tsx @@ -63,6 +63,8 @@ export const RunDetails = () => { const finishedAt = getRunListFinishedAt(runData); + const statusMessage = getRunStatusMessage(runData); + return ( <> {t('common.general')}}> @@ -113,9 +115,9 @@ export const RunDetails = () => {
- {getRunStatusMessage(runData)} + {statusMessage}
diff --git a/frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx b/frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx index 9f05143429..285c29ad9f 100644 --- a/frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx +++ b/frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx @@ -84,13 +84,14 @@ export const useColumnsDefinitions = () => { const terminationReason = finishedRunStatuses.includes(item.status) ? item.latest_job_submission?.termination_reason : null; + const statusMessage = getRunStatusMessage(item); return ( - {getRunStatusMessage(item)} + {statusMessage} ); }, diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index f942ca05b0..d025160d0c 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -106,7 +106,12 @@ def apply_configuration( ssh_identity_file=configurator_args.ssh_identity_file, ) - print_run_plan(run_plan, max_offers=configurator_args.max_offers) + no_fleets = False + if len(run_plan.job_plans[0].offers) == 0: + if len(self.api.client.fleets.list(self.api.project)) == 0: + no_fleets = True + + print_run_plan(run_plan, max_offers=configurator_args.max_offers, no_fleets=no_fleets) confirm_message = "Submit a new run?" if conf.name: diff --git a/src/dstack/_internal/cli/utils/common.py b/src/dstack/_internal/cli/utils/common.py index 87f0687e1b..e49a2b596d 100644 --- a/src/dstack/_internal/cli/utils/common.py +++ b/src/dstack/_internal/cli/utils/common.py @@ -35,6 +35,12 @@ " https://dstack.ai/docs/guides/troubleshooting/#no-offers" "[/]\n" ) +NO_FLEETS_WARNING = ( + "[warning]" + "The project has no fleets. 
Create one before submitting a run:" + " https://dstack.ai/docs/concepts/fleets" + "[/]\n" +) def cli_error(e: DstackError) -> CLIError: diff --git a/src/dstack/_internal/cli/utils/run.py b/src/dstack/_internal/cli/utils/run.py index 68dc828f79..1b6dfbaeda 100644 --- a/src/dstack/_internal/cli/utils/run.py +++ b/src/dstack/_internal/cli/utils/run.py @@ -6,7 +6,12 @@ from dstack._internal.cli.models.offers import OfferCommandOutput, OfferRequirements from dstack._internal.cli.models.runs import PsCommandOutput -from dstack._internal.cli.utils.common import NO_OFFERS_WARNING, add_row_from_dict, console +from dstack._internal.cli.utils.common import ( + NO_FLEETS_WARNING, + NO_OFFERS_WARNING, + add_row_from_dict, + console, +) from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.configurations import DevEnvironmentConfiguration from dstack._internal.core.models.instances import ( @@ -75,7 +80,10 @@ def print_runs_json(project: str, runs: List[Run]) -> None: def print_run_plan( - run_plan: RunPlan, max_offers: Optional[int] = None, include_run_properties: bool = True + run_plan: RunPlan, + max_offers: Optional[int] = None, + include_run_properties: bool = True, + no_fleets: bool = False, ): run_spec = run_plan.get_effective_run_spec() job_plan = run_plan.job_plans[0] @@ -195,7 +203,7 @@ def th(s: str) -> str: ) console.print() else: - console.print(NO_OFFERS_WARNING) + console.print(NO_FLEETS_WARNING if no_fleets else NO_OFFERS_WARNING) def _format_run_status(run) -> str: @@ -215,8 +223,10 @@ def _format_run_status(run) -> str: RunStatus.FAILED: "indian_red1", RunStatus.DONE: "grey", } - if status_text == "no offers" or status_text == "interrupted": + if status_text in ("no offers", "interrupted"): color = "gold1" + elif status_text == "no fleets": + color = "indian_red1" elif status_text == "pulling": color = "sea_green3" else: @@ -230,6 +240,8 @@ def _format_job_submission_status(job_submission: JobSubmission, verbose: 
bool) job_status = job_submission.status if status_message in ("no offers", "interrupted"): color = "gold1" + elif status_message == "no fleets": + color = "indian_red1" elif status_message == "stopped": color = "grey" else: diff --git a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py index 21a5e4bffc..4ddd6a13d7 100644 --- a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py @@ -349,6 +349,7 @@ async def _process_submitted_job( job_model.termination_reason = ( JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY ) + # Note: `_get_job_status_message` relies on the "No fleet found" substring to return "no fleets" job_model.termination_reason_message = ( "No fleet found. Create it before submitting a run: " "https://dstack.ai/docs/concepts/fleets" diff --git a/src/dstack/_internal/server/services/jobs/__init__.py b/src/dstack/_internal/server/services/jobs/__init__.py index 1ed3c5f99e..68fea166c1 100644 --- a/src/dstack/_internal/server/services/jobs/__init__.py +++ b/src/dstack/_internal/server/services/jobs/__init__.py @@ -804,6 +804,11 @@ def _get_job_status_message(job_model: JobModel) -> str: elif ( job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY ): + if ( + job_model.termination_reason_message + and "No fleet found" in job_model.termination_reason_message + ): + return "no fleets" return "no offers" elif job_model.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY: return "interrupted" diff --git a/src/tests/_internal/cli/utils/test_run.py b/src/tests/_internal/cli/utils/test_run.py index b824c001aa..20f37a820b 100644 --- a/src/tests/_internal/cli/utils/test_run.py +++ b/src/tests/_internal/cli/utils/test_run.py @@ -96,6 +96,7 @@ async def create_run_with_job( job_provisioning_data: 
Optional[JobProvisioningData] = None, termination_reason: Optional[JobTerminationReason] = None, exit_status: Optional[int] = None, + termination_reason_message: Optional[str] = None, submitted_at: Optional[datetime] = None, ) -> Run: if submitted_at is None: @@ -178,6 +179,9 @@ async def create_run_with_job( if exit_status is not None: job_model.exit_status = exit_status + if termination_reason_message is not None: + job_model.termination_reason_message = termination_reason_message + if exit_status is not None or termination_reason_message is not None: await session.commit() await session.refresh(run_model_db) @@ -226,13 +230,14 @@ async def test_simple_run(self, session: AsyncSession): assert status_style == "bold sea_green3" @pytest.mark.parametrize( - "job_status,termination_reason,exit_status,expected_status,expected_style", + "job_status,termination_reason,exit_status,termination_reason_message,expected_status,expected_style", [ - (JobStatus.DONE, None, None, "exited (0)", "grey"), + (JobStatus.DONE, None, None, None, "exited (0)", "grey"), ( JobStatus.FAILED, JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, 1, + None, "exited (1)", "indian_red1", ), @@ -240,6 +245,7 @@ async def test_simple_run(self, session: AsyncSession): JobStatus.FAILED, JobTerminationReason.CONTAINER_EXITED_WITH_ERROR, 42, + None, "exited (42)", "indian_red1", ), @@ -247,13 +253,23 @@ async def test_simple_run(self, session: AsyncSession): JobStatus.FAILED, JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, None, + None, "no offers", "gold1", ), + ( + JobStatus.FAILED, + JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, + None, + "No fleet found. 
Create it before submitting a run: https://dstack.ai/docs/concepts/fleets", + "no fleets", + "indian_red1", + ), ( JobStatus.FAILED, JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY, None, + None, "interrupted", "gold1", ), @@ -261,6 +277,7 @@ async def test_simple_run(self, session: AsyncSession): JobStatus.FAILED, JobTerminationReason.INSTANCE_UNREACHABLE, None, + None, "error", "indian_red1", ), @@ -268,14 +285,22 @@ async def test_simple_run(self, session: AsyncSession): JobStatus.TERMINATED, JobTerminationReason.TERMINATED_BY_USER, None, + None, "stopped", "grey", ), - (JobStatus.TERMINATED, JobTerminationReason.ABORTED_BY_USER, None, "aborted", "grey"), - (JobStatus.RUNNING, None, None, "running", "bold sea_green3"), - (JobStatus.PROVISIONING, None, None, "provisioning", "bold deep_sky_blue1"), - (JobStatus.PULLING, None, None, "pulling", "bold sea_green3"), - (JobStatus.TERMINATING, None, None, "terminating", "bold deep_sky_blue1"), + ( + JobStatus.TERMINATED, + JobTerminationReason.ABORTED_BY_USER, + None, + None, + "aborted", + "grey", + ), + (JobStatus.RUNNING, None, None, None, "running", "bold sea_green3"), + (JobStatus.PROVISIONING, None, None, None, "provisioning", "bold deep_sky_blue1"), + (JobStatus.PULLING, None, None, None, "pulling", "bold sea_green3"), + (JobStatus.TERMINATING, None, None, None, "terminating", "bold deep_sky_blue1"), ], ) async def test_status_messages( @@ -284,6 +309,7 @@ async def test_status_messages( job_status: JobStatus, termination_reason: Optional[JobTerminationReason], exit_status: Optional[int], + termination_reason_message: Optional[str], expected_status: str, expected_style: str, ): @@ -292,6 +318,7 @@ async def test_status_messages( job_status=job_status, termination_reason=termination_reason, exit_status=exit_status, + termination_reason_message=termination_reason_message, ) table = get_runs_table([api_run], verbose=False) From 06448719889dc2da5e7fa88735b7823f20969b28 Mon Sep 17 00:00:00 2001 From: Oleg Date: Thu, 
25 Dec 2025 12:25:10 +0300 Subject: [PATCH 022/187] No fleets notification #373 (#3418) * No fleets notification #373 * Fixed styles https://github.com/dstackai/dstack-cloud/issues/373 * Fixed after review https://github.com/dstackai/dstack-cloud/issues/373 * Fixed after review https://github.com/dstackai/dstack-cloud/issues/373 * Added notification about fleets on project details page https://github.com/dstackai/dstack-cloud/issues/373 * Get only active fleets --- .../useCheckingForFleetsInProjectsOfMember.ts | 51 ++++++++ frontend/src/locale/en.json | 5 + frontend/src/pages/Fleets/List/index.tsx | 71 ++++++++--- .../src/pages/Fleets/List/styles.module.scss | 12 ++ .../pages/Project/Details/Settings/index.tsx | 32 ++++- .../Details/Settings/styles.module.scss | 11 ++ frontend/src/pages/Runs/List/index.tsx | 119 +++++++++++------- .../src/pages/Runs/List/styles.module.scss | 12 ++ frontend/src/services/fleet.ts | 2 +- 9 files changed, 249 insertions(+), 66 deletions(-) create mode 100644 frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts diff --git a/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts b/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts new file mode 100644 index 0000000000..1028336070 --- /dev/null +++ b/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts @@ -0,0 +1,51 @@ +import { useEffect, useMemo, useState } from 'react'; + +import { useLazyGetFleetsQuery } from '../services/fleet'; +import { useGetProjectsQuery } from '../services/project'; + +type Args = { projectNames?: IProject['project_name'][] }; + +export const useCheckingForFleetsInProjects = ({ projectNames }: Args) => { + const [projectFleetMap, setProjectFleetMap] = useState>({}); + const { data: projectsData } = useGetProjectsQuery(undefined, { + skip: !!projectNames?.length, + }); + + const [getFleets] = useLazyGetFleetsQuery(); + + const projectNameForChecking = useMemo(() => { + if (projectNames) { + return projectNames; + } + + if 
(projectsData) { + return projectsData.map((project) => project.project_name); + } + + return []; + }, [projectNames, projectsData]); + + useEffect(() => { + const fetchFleets = async () => { + const map: Record = {}; + + await Promise.all( + projectNameForChecking.map((projectName) => + getFleets({ + limit: 1, + project_name: projectName, + only_active: true, + }) + .unwrap() + .then((data) => (map[projectName] = Boolean(data.length))), + ), + ); + + setProjectFleetMap(map); + }; + + fetchFleets(); + }, [projectNameForChecking]); + + return projectFleetMap; +}; diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index 7c07a5f938..f026151083 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -564,6 +564,11 @@ }, "fleets": { + "no_alert": { + "title": "No fleets", + "description": "The project has no fleets. Create one before submitting a run.", + "button_title": "Create a fleet" + }, "fleet": "Fleet", "fleet_placeholder": "Filtering by fleet", "fleet_name": "Fleet name", diff --git a/frontend/src/pages/Fleets/List/index.tsx b/frontend/src/pages/Fleets/List/index.tsx index 3ac92310fb..0966d89d1e 100644 --- a/frontend/src/pages/Fleets/List/index.tsx +++ b/frontend/src/pages/Fleets/List/index.tsx @@ -1,10 +1,13 @@ import React from 'react'; import { useTranslation } from 'react-i18next'; +import { ButtonProps } from '@cloudscape-design/components/button'; -import { Button, Header, Loader, PropertyFilter, SpaceBetween, Table, Toggle } from 'components'; +import { Alert, Button, Header, Loader, PropertyFilter, SpaceBetween, Table, Toggle } from 'components'; import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; import { useBreadcrumbs, useCollection, useInfiniteScroll } from 'hooks'; +import { useCheckingForFleetsInProjects } from 'hooks/useCheckingForFleetsInProjectsOfMember'; +import { goToUrl } from 'libs'; import { ROUTES } from 'routes'; import { useLazyGetFleetsQuery } from 'services/fleet'; @@ -35,6 +38,8 @@ export const 
FleetList: React.FC = () => { isDisabledClearFilter, } = useFilters(); + const projectHavingFleetMap = useCheckingForFleetsInProjects({}); + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ useLazyQuery: useLazyGetFleetsQuery, args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE }, @@ -67,6 +72,13 @@ export const FleetList: React.FC = () => { deleteFleets([...selectedItems]).catch(console.log); }; + const projectDontHasFleet = Object.keys(projectHavingFleetMap).find((project) => !projectHavingFleetMap[project]); + + const onCreateAFleet: ButtonProps['onClick'] = (event) => { + event.preventDefault(); + goToUrl('https://dstack.ai/docs/quickstart/#create-a-fleet', true); + }; + return (
{ stickyHeader={true} selectionType="multi" header={ -
- - -
+ <> + {projectDontHasFleet && ( +
+ + {t('fleets.no_alert.button_title')} + + } + > + The project {projectDontHasFleet} has no fleets. Create one before submitting a + run. + +
+ )} + +
+ + +
+ } filter={
diff --git a/frontend/src/pages/Fleets/List/styles.module.scss b/frontend/src/pages/Fleets/List/styles.module.scss index 022678e83e..1972454295 100644 --- a/frontend/src/pages/Fleets/List/styles.module.scss +++ b/frontend/src/pages/Fleets/List/styles.module.scss @@ -1,3 +1,15 @@ +.alertBox { + margin-bottom: 12px; + + :global { + & [class^="awsui_alert"] { + & [class^="awsui_action-slot"] { + display: flex; + align-items: center; + } + } + } +} .filters { display: flex; flex-wrap: wrap; diff --git a/frontend/src/pages/Project/Details/Settings/index.tsx b/frontend/src/pages/Project/Details/Settings/index.tsx index 2cd6b4915a..7a0fbfb8e6 100644 --- a/frontend/src/pages/Project/Details/Settings/index.tsx +++ b/frontend/src/pages/Project/Details/Settings/index.tsx @@ -3,9 +3,11 @@ import { useTranslation } from 'react-i18next'; import { useNavigate, useParams } from 'react-router-dom'; import { debounce } from 'lodash'; import { ExpandableSection, Tabs } from '@cloudscape-design/components'; +import { ButtonProps } from '@cloudscape-design/components/button'; import Wizard from '@cloudscape-design/components/wizard'; import { + Alert, Box, Button, ButtonWithConfirmation, @@ -22,7 +24,7 @@ import { import { HotspotIds } from 'layouts/AppLayout/TutorialPanel/constants'; import { useBreadcrumbs, useNotifications } from 'hooks'; -import { riseRouterException } from 'libs'; +import { goToUrl, riseRouterException } from 'libs'; import { copyToClipboard } from 'libs'; import { ROUTES } from 'routes'; import { useGetProjectQuery, useUpdateProjectMembersMutation, useUpdateProjectMutation } from 'services/project'; @@ -35,6 +37,7 @@ import { useDeleteProject } from 'pages/Project/hooks/useDeleteProject'; import { ProjectMembers } from 'pages/Project/Members'; import { getProjectRoleByUserName } from 'pages/Project/utils'; +import { useCheckingForFleetsInProjects } from '../../../../hooks/useCheckingForFleetsInProjectsOfMember'; import { useBackendsTable } from 
'../../Backends/hooks'; import { BackendsTable } from '../../Backends/Table'; import { GatewaysTable } from '../../Gateways'; @@ -60,6 +63,10 @@ export const ProjectSettings: React.FC = () => { const { deleteProject, isDeleting } = useDeleteProject(); const { data: currentUser } = useGetUserDataQuery({}); + const projectNames = useMemo(() => [paramProjectName], [paramProjectName]); + + const projectHavingFleetMap = useCheckingForFleetsInProjects({ projectNames }); + const { data, isLoading, error } = useGetProjectQuery({ name: paramProjectName }); const { data: runsData } = useGetRunsQuery({ @@ -180,6 +187,13 @@ export const ProjectSettings: React.FC = () => { const [activeStepIndex, setActiveStepIndex] = React.useState(0); + const projectDontHasFleet = !projectHavingFleetMap?.[paramProjectName]; + + const onCreateAFleet: ButtonProps['onClick'] = (event) => { + event.preventDefault(); + goToUrl('https://dstack.ai/docs/quickstart/#create-a-fleet', true); + }; + if (isLoadingPage) return ( @@ -191,6 +205,22 @@ export const ProjectSettings: React.FC = () => { <> {data && backendsData && gatewaysData && ( + {projectDontHasFleet && ( +
+ + {t('fleets.no_alert.button_title')} + + } + > + The project {paramProjectName} has no fleets. Create one before submitting a run. + +
+ )} + {isProjectMember && ( { localStorePrefix: 'administration-run-list-page', }); + const projectHavingFleetMap = useCheckingForFleetsInProjects({}); + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ useLazyQuery: useLazyGetRunsQuery, args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE, job_submissions_limit: 1 }, @@ -117,6 +122,13 @@ export const RunList: React.FC = () => { } }; + const projectDontHasFleet = Object.keys(projectHavingFleetMap).find((project) => !projectHavingFleetMap[project]); + + const onCreateAFleet: ButtonProps['onClick'] = (event) => { + event.preventDefault(); + goToUrl('https://dstack.ai/docs/quickstart/#create-a-fleet', true); + }; + return (
{ columnDisplay={preferences.contentDisplay} preferences={} header={ -
- + {projectDontHasFleet && ( +
+ + {t('fleets.no_alert.button_title')} + + } > - {t('common.new')} - - - - - - - {/**/} - -
+ The project {projectDontHasFleet} has no fleets. Create one before submitting a + run. + + + )} + +
+ + {t('common.new')} + + + + + + + {/**/} + +
+ } filter={
diff --git a/frontend/src/pages/Runs/List/styles.module.scss b/frontend/src/pages/Runs/List/styles.module.scss index 0b5efa7b66..5100d53f5f 100644 --- a/frontend/src/pages/Runs/List/styles.module.scss +++ b/frontend/src/pages/Runs/List/styles.module.scss @@ -1,3 +1,15 @@ +.alertBox { + margin-bottom: 12px; + + :global { + & [class^="awsui_alert"] { + & [class^="awsui_action-slot"] { + display: flex; + align-items: center; + } + } + } +} .selectFilters { display: flex; flex-wrap: wrap; diff --git a/frontend/src/services/fleet.ts b/frontend/src/services/fleet.ts index e74753b3ce..3405a18b8b 100644 --- a/frontend/src/services/fleet.ts +++ b/frontend/src/services/fleet.ts @@ -69,4 +69,4 @@ export const fleetApi = createApi({ }), }); -export const { useLazyGetFleetsQuery, useDeleteFleetMutation, useGetFleetDetailsQuery } = fleetApi; +export const { useGetFleetsQuery, useLazyGetFleetsQuery, useDeleteFleetMutation, useGetFleetDetailsQuery } = fleetApi; From 6be8f530c7a7a1e9adc2742934a3194cc1a16584 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Thu, 25 Dec 2025 14:33:14 +0500 Subject: [PATCH 023/187] Bump gpuhunt==0.1.16 (#3423) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e540705d93..c0036ff7be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "python-multipart>=0.0.16", "filelock", "psutil", - "gpuhunt==0.1.15", + "gpuhunt==0.1.16", "argcomplete>=3.5.0", "ignore-python>=0.2.0", "orjson", From 178abdc2817fd3abbf556f240f4760e75ec5e8db Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Thu, 25 Dec 2025 15:40:33 +0500 Subject: [PATCH 024/187] Revert "No fleets notification #373 (#3418)" (#3424) This reverts commit 06448719889dc2da5e7fa88735b7823f20969b28. 
--- .../useCheckingForFleetsInProjectsOfMember.ts | 51 -------- frontend/src/locale/en.json | 5 - frontend/src/pages/Fleets/List/index.tsx | 71 +++-------- .../src/pages/Fleets/List/styles.module.scss | 12 -- .../pages/Project/Details/Settings/index.tsx | 32 +---- .../Details/Settings/styles.module.scss | 11 -- frontend/src/pages/Runs/List/index.tsx | 119 +++++++----------- .../src/pages/Runs/List/styles.module.scss | 12 -- frontend/src/services/fleet.ts | 2 +- 9 files changed, 66 insertions(+), 249 deletions(-) delete mode 100644 frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts diff --git a/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts b/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts deleted file mode 100644 index 1028336070..0000000000 --- a/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts +++ /dev/null @@ -1,51 +0,0 @@ -import { useEffect, useMemo, useState } from 'react'; - -import { useLazyGetFleetsQuery } from '../services/fleet'; -import { useGetProjectsQuery } from '../services/project'; - -type Args = { projectNames?: IProject['project_name'][] }; - -export const useCheckingForFleetsInProjects = ({ projectNames }: Args) => { - const [projectFleetMap, setProjectFleetMap] = useState>({}); - const { data: projectsData } = useGetProjectsQuery(undefined, { - skip: !!projectNames?.length, - }); - - const [getFleets] = useLazyGetFleetsQuery(); - - const projectNameForChecking = useMemo(() => { - if (projectNames) { - return projectNames; - } - - if (projectsData) { - return projectsData.map((project) => project.project_name); - } - - return []; - }, [projectNames, projectsData]); - - useEffect(() => { - const fetchFleets = async () => { - const map: Record = {}; - - await Promise.all( - projectNameForChecking.map((projectName) => - getFleets({ - limit: 1, - project_name: projectName, - only_active: true, - }) - .unwrap() - .then((data) => (map[projectName] = Boolean(data.length))), - ), - ); - - 
setProjectFleetMap(map); - }; - - fetchFleets(); - }, [projectNameForChecking]); - - return projectFleetMap; -}; diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index f026151083..7c07a5f938 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -564,11 +564,6 @@ }, "fleets": { - "no_alert": { - "title": "No fleets", - "description": "The project has no fleets. Create one before submitting a run.", - "button_title": "Create a fleet" - }, "fleet": "Fleet", "fleet_placeholder": "Filtering by fleet", "fleet_name": "Fleet name", diff --git a/frontend/src/pages/Fleets/List/index.tsx b/frontend/src/pages/Fleets/List/index.tsx index 0966d89d1e..3ac92310fb 100644 --- a/frontend/src/pages/Fleets/List/index.tsx +++ b/frontend/src/pages/Fleets/List/index.tsx @@ -1,13 +1,10 @@ import React from 'react'; import { useTranslation } from 'react-i18next'; -import { ButtonProps } from '@cloudscape-design/components/button'; -import { Alert, Button, Header, Loader, PropertyFilter, SpaceBetween, Table, Toggle } from 'components'; +import { Button, Header, Loader, PropertyFilter, SpaceBetween, Table, Toggle } from 'components'; import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; import { useBreadcrumbs, useCollection, useInfiniteScroll } from 'hooks'; -import { useCheckingForFleetsInProjects } from 'hooks/useCheckingForFleetsInProjectsOfMember'; -import { goToUrl } from 'libs'; import { ROUTES } from 'routes'; import { useLazyGetFleetsQuery } from 'services/fleet'; @@ -38,8 +35,6 @@ export const FleetList: React.FC = () => { isDisabledClearFilter, } = useFilters(); - const projectHavingFleetMap = useCheckingForFleetsInProjects({}); - const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ useLazyQuery: useLazyGetFleetsQuery, args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE }, @@ -72,13 +67,6 @@ export const FleetList: React.FC = () => { deleteFleets([...selectedItems]).catch(console.log); }; - const 
projectDontHasFleet = Object.keys(projectHavingFleetMap).find((project) => !projectHavingFleetMap[project]); - - const onCreateAFleet: ButtonProps['onClick'] = (event) => { - event.preventDefault(); - goToUrl('https://dstack.ai/docs/quickstart/#create-a-fleet', true); - }; - return (
{ stickyHeader={true} selectionType="multi" header={ - <> - {projectDontHasFleet && ( -
- - {t('fleets.no_alert.button_title')} - - } - > - The project {projectDontHasFleet} has no fleets. Create one before submitting a - run. - -
- )} - -
- - -
- +
+ + +
} filter={
diff --git a/frontend/src/pages/Fleets/List/styles.module.scss b/frontend/src/pages/Fleets/List/styles.module.scss index 1972454295..022678e83e 100644 --- a/frontend/src/pages/Fleets/List/styles.module.scss +++ b/frontend/src/pages/Fleets/List/styles.module.scss @@ -1,15 +1,3 @@ -.alertBox { - margin-bottom: 12px; - - :global { - & [class^="awsui_alert"] { - & [class^="awsui_action-slot"] { - display: flex; - align-items: center; - } - } - } -} .filters { display: flex; flex-wrap: wrap; diff --git a/frontend/src/pages/Project/Details/Settings/index.tsx b/frontend/src/pages/Project/Details/Settings/index.tsx index 7a0fbfb8e6..2cd6b4915a 100644 --- a/frontend/src/pages/Project/Details/Settings/index.tsx +++ b/frontend/src/pages/Project/Details/Settings/index.tsx @@ -3,11 +3,9 @@ import { useTranslation } from 'react-i18next'; import { useNavigate, useParams } from 'react-router-dom'; import { debounce } from 'lodash'; import { ExpandableSection, Tabs } from '@cloudscape-design/components'; -import { ButtonProps } from '@cloudscape-design/components/button'; import Wizard from '@cloudscape-design/components/wizard'; import { - Alert, Box, Button, ButtonWithConfirmation, @@ -24,7 +22,7 @@ import { import { HotspotIds } from 'layouts/AppLayout/TutorialPanel/constants'; import { useBreadcrumbs, useNotifications } from 'hooks'; -import { goToUrl, riseRouterException } from 'libs'; +import { riseRouterException } from 'libs'; import { copyToClipboard } from 'libs'; import { ROUTES } from 'routes'; import { useGetProjectQuery, useUpdateProjectMembersMutation, useUpdateProjectMutation } from 'services/project'; @@ -37,7 +35,6 @@ import { useDeleteProject } from 'pages/Project/hooks/useDeleteProject'; import { ProjectMembers } from 'pages/Project/Members'; import { getProjectRoleByUserName } from 'pages/Project/utils'; -import { useCheckingForFleetsInProjects } from '../../../../hooks/useCheckingForFleetsInProjectsOfMember'; import { useBackendsTable } from 
'../../Backends/hooks'; import { BackendsTable } from '../../Backends/Table'; import { GatewaysTable } from '../../Gateways'; @@ -63,10 +60,6 @@ export const ProjectSettings: React.FC = () => { const { deleteProject, isDeleting } = useDeleteProject(); const { data: currentUser } = useGetUserDataQuery({}); - const projectNames = useMemo(() => [paramProjectName], [paramProjectName]); - - const projectHavingFleetMap = useCheckingForFleetsInProjects({ projectNames }); - const { data, isLoading, error } = useGetProjectQuery({ name: paramProjectName }); const { data: runsData } = useGetRunsQuery({ @@ -187,13 +180,6 @@ export const ProjectSettings: React.FC = () => { const [activeStepIndex, setActiveStepIndex] = React.useState(0); - const projectDontHasFleet = !projectHavingFleetMap?.[paramProjectName]; - - const onCreateAFleet: ButtonProps['onClick'] = (event) => { - event.preventDefault(); - goToUrl('https://dstack.ai/docs/quickstart/#create-a-fleet', true); - }; - if (isLoadingPage) return ( @@ -205,22 +191,6 @@ export const ProjectSettings: React.FC = () => { <> {data && backendsData && gatewaysData && ( - {projectDontHasFleet && ( -
- - {t('fleets.no_alert.button_title')} - - } - > - The project {paramProjectName} has no fleets. Create one before submitting a run. - -
- )} - {isProjectMember && ( { localStorePrefix: 'administration-run-list-page', }); - const projectHavingFleetMap = useCheckingForFleetsInProjects({}); - const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ useLazyQuery: useLazyGetRunsQuery, args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE, job_submissions_limit: 1 }, @@ -122,13 +117,6 @@ export const RunList: React.FC = () => { } }; - const projectDontHasFleet = Object.keys(projectHavingFleetMap).find((project) => !projectHavingFleetMap[project]); - - const onCreateAFleet: ButtonProps['onClick'] = (event) => { - event.preventDefault(); - goToUrl('https://dstack.ai/docs/quickstart/#create-a-fleet', true); - }; - return (
{ columnDisplay={preferences.contentDisplay} preferences={} header={ - <> - {projectDontHasFleet && ( -
- - {t('fleets.no_alert.button_title')} - - } +
+ - The project {projectDontHasFleet} has no fleets. Create one before submitting a - run. - -
- )} - -
- - {t('common.new')} - - - - - - - {/**/} - -
- + {t('common.new')} + + + + + + + {/**/} + +
{ stickyHeader={true} selectionType="multi" header={ -
- - -
+ <> + + +
+ + +
+ } filter={
diff --git a/frontend/src/pages/Fleets/List/styles.module.scss b/frontend/src/pages/Fleets/List/styles.module.scss index 022678e83e..ec38338c42 100644 --- a/frontend/src/pages/Fleets/List/styles.module.scss +++ b/frontend/src/pages/Fleets/List/styles.module.scss @@ -1,3 +1,6 @@ +.noFleetAlert { + margin-bottom: 12px; +} .filters { display: flex; flex-wrap: wrap; diff --git a/frontend/src/pages/Project/Details/Settings/index.tsx b/frontend/src/pages/Project/Details/Settings/index.tsx index 2cd6b4915a..7d2b9bd3f0 100644 --- a/frontend/src/pages/Project/Details/Settings/index.tsx +++ b/frontend/src/pages/Project/Details/Settings/index.tsx @@ -22,6 +22,7 @@ import { import { HotspotIds } from 'layouts/AppLayout/TutorialPanel/constants'; import { useBreadcrumbs, useNotifications } from 'hooks'; +import { useCheckingForFleetsInProjects } from 'hooks/useCheckingForFleetsInProjectsOfMember'; import { riseRouterException } from 'libs'; import { copyToClipboard } from 'libs'; import { ROUTES } from 'routes'; @@ -37,6 +38,7 @@ import { getProjectRoleByUserName } from 'pages/Project/utils'; import { useBackendsTable } from '../../Backends/hooks'; import { BackendsTable } from '../../Backends/Table'; +import { NoFleetProjectAlert } from '../../components/NoFleetProjectAlert'; import { GatewaysTable } from '../../Gateways'; import { useGatewaysTable } from '../../Gateways/hooks'; import { ProjectSecrets } from '../../Secrets'; @@ -60,6 +62,10 @@ export const ProjectSettings: React.FC = () => { const { deleteProject, isDeleting } = useDeleteProject(); const { data: currentUser } = useGetUserDataQuery({}); + const projectNames = useMemo(() => [paramProjectName], [paramProjectName]); + + const projectHavingFleetMap = useCheckingForFleetsInProjects({ projectNames }); + const { data, isLoading, error } = useGetProjectQuery({ name: paramProjectName }); const { data: runsData } = useGetRunsQuery({ @@ -180,6 +186,8 @@ export const ProjectSettings: React.FC = () => { const 
[activeStepIndex, setActiveStepIndex] = React.useState(0); + const projectDontHasFleet = !projectHavingFleetMap?.[paramProjectName]; + if (isLoadingPage) return ( @@ -191,6 +199,8 @@ export const ProjectSettings: React.FC = () => { <> {data && backendsData && gatewaysData && ( + + {isProjectMember && ( = ({ projectName, show, className, dismissible }) => { + const { t } = useTranslation(); + const [dontShowAgain, setDontShowAgain] = useLocalStorageState(`noFleetProjectAlert-${projectName}`, false); + + const onCreateAFleet: ButtonProps['onClick'] = (event) => { + event.preventDefault(); + goToUrl('https://dstack.ai/docs/quickstart/#create-a-fleet', true); + }; + + const onDismiss: AlertProps['onDismiss'] = () => setDontShowAgain(true); + + if (!show || dontShowAgain) { + return null; + } + + return ( +
+ + {t('fleets.no_alert.button_title')} + + } + > + The project {projectName} has no fleets. Create one before submitting a run. + +
+ ); +}; diff --git a/frontend/src/pages/Project/components/NoFleetProjectAlert/styles.module.scss b/frontend/src/pages/Project/components/NoFleetProjectAlert/styles.module.scss new file mode 100644 index 0000000000..c49d1793fb --- /dev/null +++ b/frontend/src/pages/Project/components/NoFleetProjectAlert/styles.module.scss @@ -0,0 +1,10 @@ +.alertBox { + :global { + & [class^="awsui_alert"] { + & [class^="awsui_action-slot"] { + display: flex; + align-items: center; + } + } + } +} diff --git a/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx b/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx index 40e14c814d..278bc5b3c5 100644 --- a/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx +++ b/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx @@ -7,15 +7,17 @@ import * as yup from 'yup'; import { Box, Link, WizardProps } from '@cloudscape-design/components'; import { CardsProps } from '@cloudscape-design/components/cards'; -import type { TabsProps, ToggleProps } from 'components'; +import { TabsProps, ToggleProps } from 'components'; import { Container, FormCodeEditor, FormField, FormInput, FormSelect, SpaceBetween, Tabs, Toggle, Wizard } from 'components'; import { useBreadcrumbs, useNotifications } from 'hooks'; +import { useCheckingForFleetsInProjects } from 'hooks/useCheckingForFleetsInProjectsOfMember'; import { getServerError } from 'libs'; import { ROUTES } from 'routes'; import { useApplyRunMutation } from 'services/run'; import { OfferList } from 'pages/Offers/List'; +import { NoFleetProjectAlert } from 'pages/Project/components/NoFleetProjectAlert'; import { useGenerateYaml } from './hooks/useGenerateYaml'; import { useGetRunSpecFromYaml } from './hooks/useGetRunSpecFromYaml'; @@ -117,6 +119,9 @@ export const CreateDevEnvironment: React.FC = () => { const [getRunSpecFromYaml] = useGetRunSpecFromYaml({ projectName: selectedProject ?? '' }); + const projectHavingFleetMap = useCheckingForFleetsInProjects({ projectNames: selectedProject ? 
[selectedProject] : [] }); + const projectDontHasFleets = !!selectedProject && !projectHavingFleetMap[selectedProject]; + const [applyRun, { isLoading: isApplying }] = useApplyRunMutation(); const loading = isApplying; @@ -174,6 +179,10 @@ export const CreateDevEnvironment: React.FC = () => { const stepValidators = [validateOffer, validateSecondStep, validateConfig]; if (reason === 'next') { + if (projectDontHasFleets) { + window.scrollTo(0, 0); + } + stepValidators[activeStepIndex]?.().then((isValid) => { if (isValid) { setActiveStepIndex(requestedStepIndex); @@ -277,6 +286,12 @@ export const CreateDevEnvironment: React.FC = () => { return (
+ + { localStorePrefix: 'administration-run-list-page', }); + const projectHavingFleetMap = useCheckingForFleetsInProjects({}); + const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ useLazyQuery: useLazyGetRunsQuery, args: { ...filteringRequestParams, limit: DEFAULT_TABLE_PAGE_SIZE, job_submissions_limit: 1 }, @@ -117,6 +122,8 @@ export const RunList: React.FC = () => { } }; + const projectDontHasFleet = Object.keys(projectHavingFleetMap).find((project) => !projectHavingFleetMap[project]); + return (
{ columnDisplay={preferences.contentDisplay} preferences={} header={ -
- - {t('common.new')} - - - - - - - {/**/} - -
+ <> + + +
+ + {t('common.new')} + + + + + + + {/**/} + +
+ } filter={
diff --git a/frontend/src/pages/Runs/List/styles.module.scss b/frontend/src/pages/Runs/List/styles.module.scss index 0b5efa7b66..0598087317 100644 --- a/frontend/src/pages/Runs/List/styles.module.scss +++ b/frontend/src/pages/Runs/List/styles.module.scss @@ -1,3 +1,7 @@ +.noFleetAlert { + margin-bottom: 12px; +} + .selectFilters { display: flex; flex-wrap: wrap; diff --git a/frontend/src/services/fleet.ts b/frontend/src/services/fleet.ts index e74753b3ce..3405a18b8b 100644 --- a/frontend/src/services/fleet.ts +++ b/frontend/src/services/fleet.ts @@ -69,4 +69,4 @@ export const fleetApi = createApi({ }), }); -export const { useLazyGetFleetsQuery, useDeleteFleetMutation, useGetFleetDetailsQuery } = fleetApi; +export const { useGetFleetsQuery, useLazyGetFleetsQuery, useDeleteFleetMutation, useGetFleetDetailsQuery } = fleetApi; diff --git a/frontend/src/services/project.ts b/frontend/src/services/project.ts index 1dfbe25ef5..2f0a4bd6b5 100644 --- a/frontend/src/services/project.ts +++ b/frontend/src/services/project.ts @@ -20,7 +20,7 @@ export const projectApi = createApi({ prepareHeaders: fetchBaseQueryHeaders, }), - tagTypes: ['Projects', 'ProjectRepos', 'ProjectLogs', 'Backends'], + tagTypes: ['Projects', 'NoFleetsProject', 'ProjectRepos', 'ProjectLogs', 'Backends'], endpoints: (builder) => ({ getProjects: builder.query({ @@ -40,6 +40,27 @@ export const projectApi = createApi({ : ['Projects'], }), + getOnlyNoFleetsProjects: builder.query({ + query: (body) => { + return { + url: API.PROJECTS.LIST_ONLY_NO_FLEETS(), + method: 'POST', + body, + }; + }, + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + transformResponse: (response: any[]): IProject[] => response.map(transformProjectResponse), + + providesTags: (result) => + result + ? 
[ + ...result.map(({ project_name }) => ({ type: 'NoFleetsProject' as const, id: project_name })), + 'NoFleetsProject', + ] + : ['NoFleetsProject'], + }), + getProject: builder.query({ query: ({ name }) => { return { @@ -180,6 +201,7 @@ export const projectApi = createApi({ export const { useGetProjectsQuery, + useGetOnlyNoFleetsProjectsQuery, useLazyGetProjectsQuery, useGetProjectQuery, useCreateProjectMutation, diff --git a/frontend/src/types/project.d.ts b/frontend/src/types/project.d.ts index c4a5cd0a55..cf24c84d03 100644 --- a/frontend/src/types/project.d.ts +++ b/frontend/src/types/project.d.ts @@ -7,6 +7,11 @@ declare type TCreateWizardProjectParams = { }; }; +declare type TGetProjectsParams = { + only_no_fleets?: boolean; + include_not_joined?: boolean; +}; + declare type TProjectBackend = { name: string; config: IBackendAWS | IBackendAzure | IBackendGCP | IBackendLambda | IBackendLocal | IBackendDstack; diff --git a/src/dstack/_internal/server/routers/projects.py b/src/dstack/_internal/server/routers/projects.py index d35b9535e8..b07b7b1c62 100644 --- a/src/dstack/_internal/server/routers/projects.py +++ b/src/dstack/_internal/server/routers/projects.py @@ -23,7 +23,7 @@ ProjectManagerOrSelfLeave, ProjectMemberOrPublicAccess, ) -from dstack._internal.server.services import projects +from dstack._internal.server.services import fleets, projects from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, @@ -43,7 +43,10 @@ async def list_projects( user: UserModel = Depends(Authenticated()), ): """ - Returns all projects visible to user sorted by descending `created_at`. + Returns projects visible to the user, sorted by ascending `created_at`. + + Returns all accessible projects (member projects for regular users, all non-deleted + projects for global admins, plus public projects if `include_not_joined` is `True`). 
`members` and `backends` are always empty - call `/api/projects/{project_name}/get` to retrieve them. """ @@ -57,6 +60,25 @@ async def list_projects( ) +@router.post("/list_only_no_fleets", response_model=List[Project]) +async def list_only_no_fleets( + session: AsyncSession = Depends(get_session), + user: UserModel = Depends(Authenticated()), +): + """ + Returns only projects where the user is a member and that have no active fleets, + sorted by ascending `created_at`. + + Active fleets are those with `deleted == False`. Projects with deleted fleets + (but no active fleets) are included. + + `members` and `backends` are always empty - call `/api/projects/{project_name}/get` to retrieve them. + """ + return CustomORJSONResponse( + await fleets.list_projects_with_no_active_fleets(session=session, user=user) + ) + + @router.post("/create", response_model=Project) async def create_project( body: CreateProjectRequest, diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index 16901bdf1a..e347829fa4 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -6,7 +6,7 @@ from sqlalchemy import and_, func, or_, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload, selectinload +from sqlalchemy.orm import aliased, joinedload, selectinload from dstack._internal.core.backends.base.backend import Backend from dstack._internal.core.backends.features import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT @@ -40,6 +40,7 @@ Profile, SpotPolicy, ) +from dstack._internal.core.models.projects import Project from dstack._internal.core.models.resources import ResourcesSpec from dstack._internal.core.models.runs import JobProvisioningData, Requirements, get_policy_map from dstack._internal.core.models.users import GlobalRole @@ -50,6 +51,7 @@ FleetModel, InstanceModel, JobModel, + MemberModel, ProjectModel, UserModel, ) @@ -70,6 +72,7 @@ 
get_member, get_member_permissions, list_user_project_models, + project_model_to_project, ) from dstack._internal.server.services.resources import set_resources_defaults from dstack._internal.utils import random_names @@ -98,6 +101,53 @@ def switch_fleet_status( events.emit(session, msg, actor=actor, targets=[events.Target.from_model(fleet_model)]) +async def list_projects_with_no_active_fleets( + session: AsyncSession, + user: UserModel, +) -> List[Project]: + """ + Returns all projects where the user is a member that have no active fleets. + + Active fleets are those with `deleted == False`. Projects with only deleted fleets + (or no fleets) are included. Deleted projects are excluded. + + Applies to all users (both regular users and admins require membership). + """ + active_fleet_alias = aliased(FleetModel) + member_alias = aliased(MemberModel) + + query = ( + select(ProjectModel) + .join( + member_alias, + and_( + member_alias.project_id == ProjectModel.id, + member_alias.user_id == user.id, + ), + ) + .outerjoin( + active_fleet_alias, + and_( + active_fleet_alias.project_id == ProjectModel.id, + active_fleet_alias.deleted == False, + ), + ) + .where( + ProjectModel.deleted == False, + active_fleet_alias.id.is_(None), + ) + .order_by(ProjectModel.created_at) + ) + + res = await session.execute(query) + project_models = list(res.scalars().unique().all()) + + return [ + project_model_to_project(p, include_backends=False, include_members=False) + for p in project_models + ] + + async def list_fleets( session: AsyncSession, user: UserModel, diff --git a/src/tests/_internal/server/routers/test_projects.py b/src/tests/_internal/server/routers/test_projects.py index 4b62ac416d..5c9ef42ffb 100644 --- a/src/tests/_internal/server/routers/test_projects.py +++ b/src/tests/_internal/server/routers/test_projects.py @@ -34,6 +34,14 @@ async def test_returns_40x_if_not_authenticated(self, test_db, client: AsyncClie response = await client.post("/api/projects/list") assert 
response.status_code in [401, 403] + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_list_only_no_fleets_returns_40x_if_not_authenticated( + self, test_db, client: AsyncClient + ): + response = await client.post("/api/projects/list_only_no_fleets") + assert response.status_code in [401, 403] + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_returns_empty_list(self, test_db, session: AsyncSession, client: AsyncClient): @@ -208,6 +216,495 @@ async def test_member_sees_both_public_and_private_projects( assert "public_project" in project_names assert "private_project" in project_names + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_returns_projects_without_active_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create project with no fleets + project_no_fleets = await create_project( + session=session, + owner=user, + name="project_no_fleets", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_no_fleets, user=user, project_role=ProjectRole.ADMIN + ) + + # Create project with active fleet + project_with_active_fleet = await create_project( + session=session, + owner=user, + name="project_with_active_fleet", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_with_active_fleet, + user=user, + project_role=ProjectRole.ADMIN, + ) + await create_fleet( + session=session, + project=project_with_active_fleet, + deleted=False, + ) + + # Create project with deleted fleet (should be included) + project_with_deleted_fleet = await create_project( + session=session, + owner=user, + name="project_with_deleted_fleet", 
+ created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_with_deleted_fleet, + user=user, + project_role=ProjectRole.ADMIN, + ) + deleted_fleet = await create_fleet( + session=session, + project=project_with_deleted_fleet, + deleted=True, + ) + deleted_fleet.status = FleetStatus.TERMINATED + await session.commit() + + # Test with list_only_no_fleets endpoint + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should only return projects without active fleets + assert len(projects) == 2 + project_names = {p["project_name"] for p in projects} + assert "project_no_fleets" in project_names + assert "project_with_deleted_fleet" in project_names + assert "project_with_active_fleet" not in project_names + + # Test with regular list endpoint (default) + response = await client.post( + "/api/projects/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should return all projects + assert len(projects) == 3 + project_names = {p["project_name"] for p in projects} + assert "project_no_fleets" in project_names + assert "project_with_active_fleet" in project_names + assert "project_with_deleted_fleet" in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_with_multiple_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test project with multiple fleets - some active, some deleted""" + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create project with both active and deleted fleets + project_mixed = await create_project( + session=session, + owner=user, + name="project_mixed", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await 
add_project_member( + session=session, project=project_mixed, user=user, project_role=ProjectRole.ADMIN + ) + # Add active fleet - should exclude project + await create_fleet( + session=session, + project=project_mixed, + deleted=False, + ) + # Add deleted fleet - should not affect exclusion + deleted_fleet = await create_fleet( + session=session, + project=project_mixed, + deleted=True, + ) + deleted_fleet.status = FleetStatus.TERMINATED + await session.commit() + + # Project should NOT be included because it has an active fleet + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + project_names = {p["project_name"] for p in projects} + assert "project_mixed" not in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_empty_result( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test when all projects have active fleets""" + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create projects, all with active fleets + for i in range(3): + project = await create_project( + session=session, + owner=user, + name=f"project_{i}", + created_at=datetime(2023, 1, 2, 3, 4 + i, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_fleet( + session=session, + project=project, + deleted=False, + ) + + # Should return empty list + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + assert len(projects) == 0 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_respects_user_permissions( + self, test_db, 
session: AsyncSession, client: AsyncClient + ): + # Create regular user (not admin) + user = await create_user(session=session, global_role=GlobalRole.USER) + + # Create another user + owner = await create_user(session=session, name="owner", global_role=GlobalRole.USER) + + # Create project where user is a member (no fleets) + project_member = await create_project( + session=session, + owner=owner, + name="project_member", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_member, user=user, project_role=ProjectRole.USER + ) + await add_project_member( + session=session, project=project_member, user=owner, project_role=ProjectRole.ADMIN + ) + + # Create public project where user is NOT a member (no fleets) + public_project = await create_project( + session=session, + owner=owner, + name="public_project", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + is_public=True, + ) + await add_project_member( + session=session, project=public_project, user=owner, project_role=ProjectRole.ADMIN + ) + + # Create private project where user is NOT a member (should not see this) + private_project = await create_project( + session=session, + owner=owner, + name="private_project", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + is_public=False, + ) + await add_project_member( + session=session, project=private_project, user=owner, project_role=ProjectRole.ADMIN + ) + + # Test with list_only_no_fleets endpoint + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should only return member projects without active fleets + # (public projects where user is not a member are no longer included) + assert len(projects) == 1 + project_names = {p["project_name"] for p in projects} + assert "project_member" in project_names + assert "public_project" not 
in project_names + assert "private_project" not in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_regular_user_filters_active_fleets( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that regular users correctly filter out projects with active fleets""" + # Create regular user (not admin) + user = await create_user(session=session, global_role=GlobalRole.USER) + + # Create another user + owner = await create_user(session=session, name="owner", global_role=GlobalRole.USER) + + # Create member project with no fleets (should be included) + project_member_no_fleet = await create_project( + session=session, + owner=owner, + name="project_member_no_fleet", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_member_no_fleet, + user=user, + project_role=ProjectRole.USER, + ) + + # Create member project with active fleet (should be excluded) + project_member_with_fleet = await create_project( + session=session, + owner=owner, + name="project_member_with_fleet", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_member_with_fleet, + user=user, + project_role=ProjectRole.USER, + ) + await create_fleet( + session=session, + project=project_member_with_fleet, + deleted=False, + ) + + # Create public project where user is a member with no fleets (should be included) + public_project_no_fleet = await create_project( + session=session, + owner=owner, + name="public_project_no_fleet", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + is_public=True, + ) + await add_project_member( + session=session, + project=public_project_no_fleet, + user=user, + project_role=ProjectRole.USER, + ) + + # Create public project where user is a member with active fleet (should be excluded) + 
public_project_with_fleet = await create_project( + session=session, + owner=owner, + name="public_project_with_fleet", + created_at=datetime(2023, 1, 2, 3, 7, tzinfo=timezone.utc), + is_public=True, + ) + await add_project_member( + session=session, + project=public_project_with_fleet, + user=user, + project_role=ProjectRole.USER, + ) + await create_fleet( + session=session, + project=public_project_with_fleet, + deleted=False, + ) + + # Test with list_only_no_fleets endpoint + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + + # Should only return member projects without active fleets + assert len(projects) == 2 + project_names = {p["project_name"] for p in projects} + assert "project_member_no_fleet" in project_names + assert "public_project_no_fleet" in project_names + assert "project_member_with_fleet" not in project_names + assert "public_project_with_fleet" not in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_filters_active_fleets_correctly( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that projects with active fleets are correctly filtered out""" + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create project with active fleet + project_with_active = await create_project( + session=session, + owner=user, + name="project_with_active", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_with_active, user=user, project_role=ProjectRole.ADMIN + ) + active_fleet = await create_fleet( + session=session, + project=project_with_active, + deleted=False, + ) + active_fleet.status = FleetStatus.ACTIVE + await session.commit() + + # Create project with terminated but not deleted fleet (still active) + 
project_with_terminated = await create_project( + session=session, + owner=user, + name="project_with_terminated", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_with_terminated, + user=user, + project_role=ProjectRole.ADMIN, + ) + terminated_fleet = await create_fleet( + session=session, + project=project_with_terminated, + deleted=False, + ) + terminated_fleet.status = FleetStatus.TERMINATED + await session.commit() + + # Both should be excluded + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + project_names = {p["project_name"] for p in projects} + assert "project_with_active" not in project_names + assert "project_with_terminated" not in project_names + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_sorted_by_created_at( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that results are sorted by created_at""" + user = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create projects in reverse order + project_3 = await create_project( + session=session, + owner=user, + name="project_3", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_3, user=user, project_role=ProjectRole.ADMIN + ) + + project_1 = await create_project( + session=session, + owner=user, + name="project_1", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, project=project_1, user=user, project_role=ProjectRole.ADMIN + ) + + project_2 = await create_project( + session=session, + owner=user, + name="project_2", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + 
session=session, project=project_2, user=user, project_role=ProjectRole.ADMIN + ) + + # Results should be sorted by created_at ascending + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + projects = response.json() + assert len(projects) == 3 + assert projects[0]["project_name"] == "project_1" + assert projects[1]["project_name"] == "project_2" + assert projects[2]["project_name"] == "project_3" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_only_no_fleets_admin_requires_membership( + self, test_db, session: AsyncSession, client: AsyncClient + ): + """Test that admins also require membership (unified behavior)""" + # Create admin user + admin = await create_user(session=session, global_role=GlobalRole.ADMIN) + + # Create another user + owner = await create_user(session=session, name="owner", global_role=GlobalRole.USER) + + # Create project where admin is a member (no fleets) - should be included + project_with_membership = await create_project( + session=session, + owner=owner, + name="project_with_membership", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_with_membership, + user=admin, + project_role=ProjectRole.ADMIN, + ) + + # Create project where admin is NOT a member (no fleets) - should NOT be included + project_without_membership = await create_project( + session=session, + owner=owner, + name="project_without_membership", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + await add_project_member( + session=session, + project=project_without_membership, + user=owner, + project_role=ProjectRole.ADMIN, + ) + + # Test with list_only_no_fleets endpoint + response = await client.post( + "/api/projects/list_only_no_fleets", + headers=get_auth_headers(admin.token), + ) + assert 
response.status_code == 200 + projects = response.json() + + # Should only return project where admin is a member + assert len(projects) == 1 + project_names = {p["project_name"] for p in projects} + assert "project_with_membership" in project_names + assert "project_without_membership" not in project_names + class TestCreateProject: @pytest.mark.asyncio From 3e931d90bd2c717b09be9c719254e41929034f03 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 30 Dec 2025 15:06:55 +0500 Subject: [PATCH 035/187] Make no fleet notifications dismissible (#3439) --- frontend/src/pages/Fleets/List/index.tsx | 1 + frontend/src/pages/Runs/List/index.tsx | 1 + 2 files changed, 2 insertions(+) diff --git a/frontend/src/pages/Fleets/List/index.tsx b/frontend/src/pages/Fleets/List/index.tsx index e3e2dfc234..0a29192e0c 100644 --- a/frontend/src/pages/Fleets/List/index.tsx +++ b/frontend/src/pages/Fleets/List/index.tsx @@ -90,6 +90,7 @@ export const FleetList: React.FC = () => { className={styles.noFleetAlert} projectName={projectDontHasFleet ?? ''} show={!!projectDontHasFleet} + dismissible={true} />
{ className={styles.noFleetAlert} projectName={projectDontHasFleet ?? ''} show={!!projectDontHasFleet} + dismissible={true} />
Date: Tue, 30 Dec 2025 15:54:47 +0500 Subject: [PATCH 036/187] Adjust kubernetes gpu matching for RTX5090 (#3440) --- src/dstack/_internal/core/backends/kubernetes/compute.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index e46b99d9d7..53feb9cda5 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -574,6 +574,7 @@ def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]: gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL) if gpu_product is None: return None + gpu_product = gpu_product.replace("RTX-", "RTX") for gpu_name in NVIDIA_GPU_NAMES: if gpu_name.lower() in gpu_product.lower().split("-"): break From 78d26ee294bf7daaf74ad78cfe9b968dbaf0c8ce Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Tue, 6 Jan 2026 09:07:13 +0000 Subject: [PATCH 037/187] [runner] Fix MPI hostfile (#3441) * Don't set slots on CPU nodes * Move the file to /dstack/mpi and make it world-readable Fixes: https://github.com/dstackai/dstack/issues/3434 Fixes: https://github.com/dstackai/dstack/issues/3436 --- runner/cmd/runner/main.go | 18 ++++++--- runner/cmd/shim/main.go | 4 +- runner/consts/consts.go | 10 ++++- runner/internal/executor/executor.go | 39 +++++++++++------- runner/internal/executor/executor_test.go | 48 ++++++++++++----------- runner/internal/runner/api/server.go | 7 +++- 6 files changed, 77 insertions(+), 49 deletions(-) diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index 7b3bb84680..a080246d41 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -7,6 +7,7 @@ import ( "io" "os" "os/signal" + "path" "path/filepath" "syscall" @@ -121,27 +122,32 @@ func start(ctx context.Context, tempDir string, homeDir string, httpPort int, ss log.DefaultEntry.Logger.SetOutput(io.MultiWriter(os.Stdout, defaultLogFile)) 
log.DefaultEntry.Logger.SetLevel(logrus.Level(logLevel)) + // NB: The Mkdir/Chown/Chmod code below relies on the fact that RunnerDstackDir path is _not_ nested (/dstack). + // Adjust it if the path is changed to, e.g., /opt/dstack + const dstackDir = consts.RunnerDstackDir + dstackSshDir := path.Join(dstackDir, "ssh") + // To ensure that all components of the authorized_keys path are owned by root and no directories // are group or world writable, as required by sshd with "StrictModes yes" (the default value), // we fix `/dstack` ownership and permissions and remove `/dstack/ssh` (it will be (re)created // in Sshd.Prepare()) // See: https://github.com/openssh/openssh-portable/blob/d01efaa1c9ed84fd9011201dbc3c7cb0a82bcee3/misc.c#L2257-L2272 - if err := os.Mkdir("/dstack", 0o755); errors.Is(err, os.ErrExist) { - if err := os.Chown("/dstack", 0, 0); err != nil { + if err := os.Mkdir(dstackDir, 0o755); errors.Is(err, os.ErrExist) { + if err := os.Chown(dstackDir, 0, 0); err != nil { return fmt.Errorf("chown dstack dir: %w", err) } - if err := os.Chmod("/dstack", 0o755); err != nil { + if err := os.Chmod(dstackDir, 0o755); err != nil { return fmt.Errorf("chmod dstack dir: %w", err) } } else if err != nil { return fmt.Errorf("create dstack dir: %w", err) } - if err := os.RemoveAll("/dstack/ssh"); err != nil { + if err := os.RemoveAll(dstackSshDir); err != nil { return fmt.Errorf("remove dstack ssh dir: %w", err) } sshd := ssh.NewSshd("/usr/sbin/sshd") - if err := sshd.Prepare(ctx, "/dstack/ssh", sshPort, "INFO"); err != nil { + if err := sshd.Prepare(ctx, dstackSshDir, sshPort, "INFO"); err != nil { return fmt.Errorf("prepare sshd: %w", err) } if err := sshd.AddAuthorizedKeys(ctx, sshAuthorizedKeys...); err != nil { @@ -156,7 +162,7 @@ func start(ctx context.Context, tempDir string, homeDir string, httpPort int, ss } }() - server, err := api.NewServer(ctx, tempDir, homeDir, fmt.Sprintf(":%d", httpPort), sshd, version) + server, err := api.NewServer(ctx, tempDir, homeDir, 
dstackDir, sshd, fmt.Sprintf(":%d", httpPort), version) if err != nil { return fmt.Errorf("create server: %w", err) } diff --git a/runner/cmd/shim/main.go b/runner/cmd/shim/main.go index 4c3c951df2..644d7e80e8 100644 --- a/runner/cmd/shim/main.go +++ b/runner/cmd/shim/main.go @@ -56,7 +56,7 @@ func mainInner() int { Usage: "Set shim's home directory", Destination: &args.Shim.HomeDir, TakesFile: true, - DefaultText: path.Join("~", consts.DstackDirPath), + DefaultText: path.Join("~", consts.DstackUserDir), Sources: cli.EnvVars("DSTACK_SHIM_HOME"), }, &cli.StringFlag{ @@ -187,7 +187,7 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error) if err != nil { return err } - shimHomeDir = filepath.Join(home, consts.DstackDirPath) + shimHomeDir = filepath.Join(home, consts.DstackUserDir) args.Shim.HomeDir = shimHomeDir } diff --git a/runner/consts/consts.go b/runner/consts/consts.go index 2c392b5ee4..4da4a139f7 100644 --- a/runner/consts/consts.go +++ b/runner/consts/consts.go @@ -1,6 +1,7 @@ package consts -const DstackDirPath string = ".dstack" +// A directory inside user's home used for dstack-related files +const DstackUserDir string = ".dstack" // Runner's log filenames const ( @@ -29,6 +30,13 @@ const ( // The current user's homedir (as of 2024-12-28, it's always root) should be used // instead of the hardcoded value RunnerHomeDir = "/root" + // A directory for: + // 1. Files used by the runner and related components (e.g., sshd stores its config and log inside /dstack/ssh) + // 2. Files shared between users (e.g., sshd authorized_keys, MPI hostfile) + // The inner structure should be considered private and subject to change, the users should not make assumptions + // about its structure. + // The only way to access its content/paths should be via public environment variables such as DSTACK_MPI_HOSTFILE. 
+ RunnerDstackDir = "/dstack" ) const ( diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go index 3e486f9704..56a5d1cd9f 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/executor/executor.go @@ -54,6 +54,7 @@ type ConnectionTracker interface { type RunExecutor struct { tempDir string homeDir string + dstackDir string archiveDir string sshd ssh.SshdManager @@ -91,7 +92,7 @@ func (s *stubConnectionTracker) GetNoConnectionsSecs() int64 { return 0 } func (s *stubConnectionTracker) Track(ticker <-chan time.Time) {} func (s *stubConnectionTracker) Stop() {} -func NewRunExecutor(tempDir string, homeDir string, sshd ssh.SshdManager) (*RunExecutor, error) { +func NewRunExecutor(tempDir string, homeDir string, dstackDir string, sshd ssh.SshdManager) (*RunExecutor, error) { mu := &sync.RWMutex{} timestamp := NewMonotonicTimestamp() user, err := osuser.Current() @@ -124,6 +125,7 @@ func NewRunExecutor(tempDir string, homeDir string, sshd ssh.SshdManager) (*RunE return &RunExecutor{ tempDir: tempDir, homeDir: homeDir, + dstackDir: dstackDir, archiveDir: filepath.Join(tempDir, "file_archives"), sshd: sshd, currentUid: uid, @@ -384,12 +386,12 @@ func (ex *RunExecutor) getRepoData() schemas.RepoData { } func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error { - node_rank := ex.jobSpec.JobNum - nodes_num := ex.jobSpec.JobsPerReplica - gpus_per_node_num := ex.clusterInfo.GPUSPerJob - gpus_num := nodes_num * gpus_per_node_num + nodeRank := ex.jobSpec.JobNum + nodesNum := ex.jobSpec.JobsPerReplica + gpusPerNodeNum := ex.clusterInfo.GPUSPerJob + gpusNum := nodesNum * gpusPerNodeNum - mpiHostfilePath := filepath.Join(ex.homeDir, ".dstack/mpi/hostfile") + mpiHostfilePath := filepath.Join(ex.dstackDir, "mpi/hostfile") jobEnvs := map[string]string{ "DSTACK_RUN_ID": ex.run.Id, @@ -400,10 +402,10 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error "DSTACK_WORKING_DIR": 
ex.jobWorkingDir, "DSTACK_NODES_IPS": strings.Join(ex.clusterInfo.JobIPs, "\n"), "DSTACK_MASTER_NODE_IP": ex.clusterInfo.MasterJobIP, - "DSTACK_NODE_RANK": strconv.Itoa(node_rank), - "DSTACK_NODES_NUM": strconv.Itoa(nodes_num), - "DSTACK_GPUS_PER_NODE": strconv.Itoa(gpus_per_node_num), - "DSTACK_GPUS_NUM": strconv.Itoa(gpus_num), + "DSTACK_NODE_RANK": strconv.Itoa(nodeRank), + "DSTACK_NODES_NUM": strconv.Itoa(nodesNum), + "DSTACK_GPUS_PER_NODE": strconv.Itoa(gpusPerNodeNum), + "DSTACK_GPUS_NUM": strconv.Itoa(gpusNum), "DSTACK_MPI_HOSTFILE": mpiHostfilePath, } @@ -460,7 +462,7 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error envMap.Update(ex.jobSpec.Env, false) const profilePath = "/etc/profile" - const dstackProfilePath = "/dstack/profile" + dstackProfilePath := path.Join(ex.dstackDir, "profile") if err := writeDstackProfile(envMap, dstackProfilePath); err != nil { log.Warning(ctx, "failed to write dstack_profile", "path", dstackProfilePath, "err", err) } else if err := includeDstackProfile(profilePath, dstackProfilePath); err != nil { @@ -508,7 +510,7 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error } } - err = writeMpiHostfile(ctx, ex.clusterInfo.JobIPs, gpus_per_node_num, mpiHostfilePath) + err = writeMpiHostfile(ctx, ex.clusterInfo.JobIPs, gpusPerNodeNum, mpiHostfilePath) if err != nil { return fmt.Errorf("write MPI hostfile: %w", err) } @@ -839,7 +841,7 @@ func prepareSSHDir(uid int, gid int, homeDir string) (string, error) { return sshDir, nil } -func writeMpiHostfile(ctx context.Context, ips []string, gpus_per_node int, path string) error { +func writeMpiHostfile(ctx context.Context, ips []string, gpusPerNode int, path string) error { if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { return fmt.Errorf("create MPI hostfile directory: %w", err) } @@ -855,9 +857,16 @@ func writeMpiHostfile(ctx context.Context, ips []string, gpus_per_node int, path } } if len(nonEmptyIps) == 
len(ips) { + var template string + if gpusPerNode == 0 { + // CPU node: the number of slots defaults to the number of processor cores on that host + // See: https://docs.open-mpi.org/en/main/launching-apps/scheduling.html#calculating-the-number-of-slots + template = "%s\n" + } else { + template = fmt.Sprintf("%%s slots=%d\n", gpusPerNode) + } for _, ip := range nonEmptyIps { - line := fmt.Sprintf("%s slots=%d\n", ip, gpus_per_node) - if _, err = file.WriteString(line); err != nil { + if _, err = fmt.Fprintf(file, template, ip); err != nil { return fmt.Errorf("write MPI hostfile line: %w", err) } } diff --git a/runner/internal/executor/executor_test.go b/runner/internal/executor/executor_test.go index cc5cae7b38..e3661fac0e 100644 --- a/runner/internal/executor/executor_test.go +++ b/runner/internal/executor/executor_test.go @@ -28,13 +28,13 @@ func TestExecutor_WorkingDir_Set(t *testing.T) { ex.jobSpec.WorkingDir = &workingDir ex.jobSpec.Commands = append(ex.jobSpec.Commands, "pwd") - err = ex.setJobWorkingDir(context.TODO()) + err = ex.setJobWorkingDir(t.Context()) require.NoError(t, err) require.Equal(t, workingDir, ex.jobWorkingDir) err = os.MkdirAll(workingDir, 0o755) require.NoError(t, err) - err = ex.execJob(context.TODO(), io.Writer(&b)) + err = ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) // Normalize line endings for cross-platform compatibility. 
assert.Equal(t, workingDir+"\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) @@ -47,11 +47,11 @@ func TestExecutor_WorkingDir_NotSet(t *testing.T) { require.NoError(t, err) ex.jobSpec.WorkingDir = nil ex.jobSpec.Commands = append(ex.jobSpec.Commands, "pwd") - err = ex.setJobWorkingDir(context.TODO()) + err = ex.setJobWorkingDir(t.Context()) require.NoError(t, err) require.Equal(t, cwd, ex.jobWorkingDir) - err = ex.execJob(context.TODO(), io.Writer(&b)) + err = ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) assert.Equal(t, cwd+"\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) } @@ -61,7 +61,7 @@ func TestExecutor_HomeDir(t *testing.T) { ex := makeTestExecutor(t) ex.jobSpec.Commands = append(ex.jobSpec.Commands, "echo ~") - err := ex.execJob(context.TODO(), io.Writer(&b)) + err := ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) assert.Equal(t, ex.homeDir+"\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) } @@ -71,7 +71,7 @@ func TestExecutor_NonZeroExit(t *testing.T) { ex.jobSpec.Commands = append(ex.jobSpec.Commands, "exit 100") makeCodeTar(t, ex.codePath) - err := ex.Run(context.TODO()) + err := ex.Run(t.Context()) assert.Error(t, err) assert.NotEmpty(t, ex.jobStateHistory) exitStatus := ex.jobStateHistory[len(ex.jobStateHistory)-1].ExitStatus @@ -90,11 +90,11 @@ func TestExecutor_SSHCredentials(t *testing.T) { PrivateKey: &key, } - clean, err := ex.setupCredentials(context.TODO()) + clean, err := ex.setupCredentials(t.Context()) defer clean() require.NoError(t, err) - err = ex.execJob(context.TODO(), io.Writer(&b)) + err = ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) assert.Equal(t, key, b.String()) } @@ -106,10 +106,10 @@ func TestExecutor_LocalRepo(t *testing.T) { ex.jobSpec.Commands = append(ex.jobSpec.Commands, cmd) makeCodeTar(t, ex.codePath) - err := ex.setupRepo(context.TODO()) + err := ex.setupRepo(t.Context()) require.NoError(t, err) - err = ex.execJob(context.TODO(), io.Writer(&b)) + err = 
ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) assert.Equal(t, "bar\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) } @@ -119,7 +119,7 @@ func TestExecutor_Recover(t *testing.T) { ex.jobSpec.Commands = nil // cause a panic makeCodeTar(t, ex.codePath) - err := ex.Run(context.TODO()) + err := ex.Run(t.Context()) assert.ErrorContains(t, err, "recovered: ") } @@ -136,7 +136,7 @@ func TestExecutor_MaxDuration(t *testing.T) { ex.jobSpec.MaxDuration = 1 // seconds makeCodeTar(t, ex.codePath) - err := ex.Run(context.TODO()) + err := ex.Run(t.Context()) assert.ErrorContains(t, err, "killed") } @@ -158,12 +158,12 @@ func TestExecutor_RemoteRepo(t *testing.T) { err := os.WriteFile(ex.codePath, []byte{}, 0o600) // empty diff require.NoError(t, err) - err = ex.setJobWorkingDir(context.TODO()) + err = ex.setJobWorkingDir(t.Context()) require.NoError(t, err) - err = ex.setupRepo(context.TODO()) + err = ex.setupRepo(t.Context()) require.NoError(t, err) - err = ex.execJob(context.TODO(), io.Writer(&b)) + err = ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) expected := fmt.Sprintf("%s\n%s\n%s\n", ex.getRepoData().RepoHash, ex.getRepoData().RepoConfigName, ex.getRepoData().RepoConfigEmail) assert.Equal(t, expected, strings.ReplaceAll(b.String(), "\r\n", "\n")) @@ -204,11 +204,13 @@ func makeTestExecutor(t *testing.T) *RunExecutor { }, } - temp := filepath.Join(baseDir, "temp") - _ = os.Mkdir(temp, 0o700) - home := filepath.Join(baseDir, "home") - _ = os.Mkdir(home, 0o700) - ex, _ := NewRunExecutor(temp, home, new(sshdMock)) + tempDir := filepath.Join(baseDir, "temp") + require.NoError(t, os.Mkdir(tempDir, 0o700)) + homeDir := filepath.Join(baseDir, "home") + require.NoError(t, os.Mkdir(homeDir, 0o700)) + dstackDir := filepath.Join(baseDir, "dstack") + require.NoError(t, os.Mkdir(dstackDir, 0o755)) + ex, _ := NewRunExecutor(tempDir, homeDir, dstackDir, new(sshdMock)) ex.SetJob(body) ex.SetCodePath(filepath.Join(baseDir, "code")) // note: create file 
before run ex.setJobWorkingDir(context.Background()) @@ -261,7 +263,7 @@ func TestExecutor_Logs(t *testing.T) { // \033[31m = red text, \033[1;32m = bold green text, \033[0m = reset ex.jobSpec.Commands = append(ex.jobSpec.Commands, "printf '\\033[31mRed Hello World\\033[0m\\n' && printf '\\033[1;32mBold Green Line 2\\033[0m\\n' && printf 'Line 3\\n'") - err := ex.execJob(context.TODO(), io.Writer(&b)) + err := ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) logHistory := ex.GetHistory(0).JobLogs @@ -285,7 +287,7 @@ func TestExecutor_LogsWithErrors(t *testing.T) { ex := makeTestExecutor(t) ex.jobSpec.Commands = append(ex.jobSpec.Commands, "echo 'Success message' && echo 'Error message' >&2 && exit 1") - err := ex.execJob(context.TODO(), io.Writer(&b)) + err := ex.execJob(t.Context(), io.Writer(&b)) assert.Error(t, err) logHistory := ex.GetHistory(0).JobLogs @@ -309,7 +311,7 @@ func TestExecutor_LogsAnsiCodeHandling(t *testing.T) { ex.jobSpec.Commands = append(ex.jobSpec.Commands, cmd) - err := ex.execJob(context.TODO(), io.Writer(&b)) + err := ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) // 1. Check WebSocket logs, which should preserve ANSI codes. 
diff --git a/runner/internal/runner/api/server.go b/runner/internal/runner/api/server.go index 2e8a526273..0a0b851a9f 100644 --- a/runner/internal/runner/api/server.go +++ b/runner/internal/runner/api/server.go @@ -34,9 +34,12 @@ type Server struct { version string } -func NewServer(ctx context.Context, tempDir string, homeDir string, address string, sshd ssh.SshdManager, version string) (*Server, error) { +func NewServer( + ctx context.Context, tempDir string, homeDir string, dstackDir string, sshd ssh.SshdManager, + address string, version string, +) (*Server, error) { r := api.NewRouter() - ex, err := executor.NewRunExecutor(tempDir, homeDir, sshd) + ex, err := executor.NewRunExecutor(tempDir, homeDir, dstackDir, sshd) if err != nil { return nil, err } From f174dff2eea7f55cc56b5736efc36ba74297940d Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 6 Jan 2026 16:19:33 +0100 Subject: [PATCH 038/187] [Crusoe] Minor edits (#3448) --- examples/clusters/crusoe/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/clusters/crusoe/README.md b/examples/clusters/crusoe/README.md index 17b1b5f019..fb6f4b1a7f 100644 --- a/examples/clusters/crusoe/README.md +++ b/examples/clusters/crusoe/README.md @@ -6,12 +6,12 @@ title: Distributed workload orchestration on Crusoe with dstack Crusoe offers two ways to use clusters with fast interconnect: -* [Kubernetes](#kubernetes) – Lets you interact with clusters through the Kubernetes API and includes support for NVIDIA GPU operators and related tools. -* [Virtual Machines (VMs)](#vms) – Gives you direct access to clusters in the form of virtual machines. +* [Crusoe Managed Kubernetes](#kubernetes) – Lets you interact with clusters through the Kubernetes API and includes support for NVIDIA and AMD GPU operators and related tools. 
+* [Virtual Machines (VMs)](#vms) – Gives you direct access to clusters in the form of virtual machines with NVIDIA and AMD GPUs. Both options use the same underlying networking infrastructure. This example walks you through how to set up Crusoe clusters to use with `dstack`. -## Kubernetes +## Crusoe Managed Kubernetes { #kubernetes } !!! info "Prerequsisites" 1. Go `Networking` → `Firewall Rules`, click `Create Firewall Rule`, and allow ingress traffic on port `30022`. This port will be used by the `dstack` server to access the jump host. @@ -21,7 +21,7 @@ Both options use the same underlying networking infrastructure. This example wal ### Configure the backend -Follow the standard instructions for setting up a [Kubernetes](https://dstack.ai/docs/concepts/backends/#kubernetes) backend: +Follow the standard instructions for setting up a [`kubernetes`](https://dstack.ai/docs/concepts/backends/#kubernetes) backend:
@@ -40,7 +40,7 @@ projects: ### Create a fleet -Once the Kubernetes cluster and the `dstack` server are running, you can create a fleet: +Once the Crusoe Managed Kubernetes cluster and the `dstack` server are running, you can create a fleet:
@@ -118,9 +118,9 @@ Once the fleet is created, you can run [dev environments](https://dstack.ai/docs Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-task) that runs NCCL tests to validate cluster network bandwidth. -=== "Kubernetes" +=== "Crusoe Managed Kubernetes" - If you’re running on Crusoe’s Kubernetes, make sure to install HPC-X and provide an up-to-date topology file. + If you’re running on Crusoe Managed Kubernetes, make sure to install HPC-X and provide an up-to-date topology file.
@@ -185,9 +185,9 @@ Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-task) > The task above downloads an A100 topology file from a Gist. The most reliable way to obtain the latest topology is to copy it from a Crusoe-provisioned VM (see [VMs](#vms)). ??? info "Privileged" - When running on Kubernetes, set `privileged` to `true` to ensure access to InfiniBand. + When running on Crusoe Managed Kubernetes, set `privileged` to `true` to ensure access to InfiniBand. -=== "SSH fleets" +=== "VMs" With Crusoe VMs, HPC-X and up-to-date topology files are already available on the hosts. When using SSH fleets, simply mount them via [instance volumes](https://dstack.ai/docs/concepts/volumes#instance-volumes). @@ -294,4 +294,4 @@ nccl-tests provisioning completed (running) 1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services) 2. Read the [Kuberentes](https://dstack.ai/docs/guides/kubernetes), and [Clusters](https://dstack.ai/docs/guides/clusters) guides -3. Check Crusoe's docs on [networking](https://docs.crusoecloud.com/networking/infiniband/) and [Kubernetes](https://docs.crusoecloud.com/orchestration/cmk/index.html) +3. 
Check the docs on [Crusoe's networking](https://docs.crusoecloud.com/networking/infiniband/) and ["Crusoe Managed" Kubernetes](https://docs.crusoecloud.com/orchestration/cmk/index.html) From d4680c998425e3bef1912833e0ee098a33358c1f Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 7 Jan 2026 16:57:15 +0100 Subject: [PATCH 039/187] [Dev environments] Support windsurf IDE (#3444) * [Dev environments] Support windsurf IDE #3443 * [Dev environments] Support windsurf IDE #3443 Replaced `surf` with `windsurf`(which is the only one alaviable on Linux) --- .../Runs/CreateDevEnvironment/constants.tsx | 25 ++++++++ .../pages/Runs/CreateDevEnvironment/index.tsx | 15 +---- .../pages/Runs/CreateDevEnvironment/types.ts | 2 +- .../index.tsx | 11 ++-- frontend/src/types/run.d.ts | 2 +- .../cli/services/configurators/run.py | 58 +++++++++++++++++++ .../_internal/core/models/configurations.py | 26 ++++++++- .../server/services/jobs/configurators/dev.py | 3 + .../jobs/configurators/extensions/windsurf.py | 43 ++++++++++++++ .../core/models/test_configurations.py | 52 ++++++++++++++++- 10 files changed, 214 insertions(+), 23 deletions(-) create mode 100644 src/dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py diff --git a/frontend/src/pages/Runs/CreateDevEnvironment/constants.tsx b/frontend/src/pages/Runs/CreateDevEnvironment/constants.tsx index 15593b6dfb..98955d6a50 100644 --- a/frontend/src/pages/Runs/CreateDevEnvironment/constants.tsx +++ b/frontend/src/pages/Runs/CreateDevEnvironment/constants.tsx @@ -23,3 +23,28 @@ export const FORM_FIELD_NAMES = { repo_path: 'repo_path', working_dir: 'working_dir', } as const satisfies Record; + +export const IDE_OPTIONS = [ + { + label: 'Cursor', + value: 'cursor', + }, + { + label: 'VS Code', + value: 'vscode', + }, + { + label: 'Windsurf', + value: 'windsurf', + }, +] as const; + +export const IDE_DISPLAY_NAMES: Record = { + cursor: 'Cursor', + vscode: 'VS 
Code', + windsurf: 'Windsurf', +}; + +export const getIDEDisplayName = (ide: string): string => { + return IDE_DISPLAY_NAMES[ide] || 'IDE'; +}; diff --git a/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx b/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx index 278bc5b3c5..af52513760 100644 --- a/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx +++ b/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx @@ -21,7 +21,7 @@ import { NoFleetProjectAlert } from 'pages/Project/components/NoFleetProjectAler import { useGenerateYaml } from './hooks/useGenerateYaml'; import { useGetRunSpecFromYaml } from './hooks/useGetRunSpecFromYaml'; -import { FORM_FIELD_NAMES } from './constants'; +import { FORM_FIELD_NAMES, IDE_OPTIONS } from './constants'; import { IRunEnvironmentFormKeys, IRunEnvironmentFormValues } from './types'; @@ -32,17 +32,6 @@ const namesFieldError = 'Only latin characters, dashes, and digits'; const urlFormatError = 'Only URLs'; const workingDirFormatError = 'Must be an absolute path'; -const ideOptions = [ - { - label: 'Cursor', - value: 'cursor', - }, - { - label: 'VS Code', - value: 'vscode', - }, -]; - enum DockerPythonTabs { DOCKER = 'docker', PYTHON = 'python', @@ -348,7 +337,7 @@ export const CreateDevEnvironment: React.FC = () => { description={t('runs.dev_env.wizard.ide_description')} control={control} name="ide" - options={ideOptions} + options={IDE_OPTIONS} disabled={loading} /> diff --git a/frontend/src/pages/Runs/CreateDevEnvironment/types.ts b/frontend/src/pages/Runs/CreateDevEnvironment/types.ts index 020d40e59c..ab504e9875 100644 --- a/frontend/src/pages/Runs/CreateDevEnvironment/types.ts +++ b/frontend/src/pages/Runs/CreateDevEnvironment/types.ts @@ -1,7 +1,7 @@ export interface IRunEnvironmentFormValues { offer: IGpu; name: string; - ide: 'cursor' | 'vscode'; + ide: 'cursor' | 'vscode' | 'windsurf'; config_yaml: string; docker: boolean; image?: string; diff --git 
a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx index ee72ebae93..b2751253ab 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx @@ -19,6 +19,7 @@ import { import { copyToClipboard } from 'libs'; import { useConfigProjectCliCommand } from 'pages/Project/hooks/useConfigProjectCliComand'; +import { getIDEDisplayName } from 'pages/Runs/CreateDevEnvironment/constants'; import styles from './styles.module.scss'; @@ -52,7 +53,9 @@ export const ConnectToRunWithDevEnvConfiguration: FC<{ run: IRun }> = ({ run }) const [attachCommand, copyAttachCommand] = getAttachCommand(run); const [sshCommand, copySSHCommand] = getSSHCommand(run); - const openInIDEUrl = `${run.run_spec.configuration.ide}://vscode-remote/ssh-remote+${run.run_spec.run_name}/${run.run_spec.working_dir || 'workflow'}`; + const configuration = run.run_spec.configuration as TDevEnvironmentConfiguration; + const openInIDEUrl = `${configuration.ide}://vscode-remote/ssh-remote+${run.run_spec.run_name}/${run.run_spec.working_dir || 'workflow'}`; + const ideDisplayName = getIDEDisplayName(configuration.ide); const [configCliCommand, copyCliCommand] = useConfigProjectCliCommand({ projectName: run.project_name }); @@ -74,7 +77,7 @@ export const ConnectToRunWithDevEnvConfiguration: FC<{ run: IRun }> = ({ run }) onNavigate={({ detail }) => setActiveStepIndex(detail.requestedStepIndex)} activeStepIndex={activeStepIndex} onSubmit={() => window.open(openInIDEUrl, '_blank')} - submitButtonText="Open in VS Code" + submitButtonText={`Open in ${ideDisplayName}`} allowSkipTo steps={[ { @@ -216,7 +219,7 @@ export const ConnectToRunWithDevEnvConfiguration: FC<{ run: IRun }> = ({ run }) }, { title: 'Open', - description: 'After the CLI is attached, you can 
open the dev environment in VS Code.', + description: `After the CLI is attached, you can open the dev environment in ${ideDisplayName}.`, content: ( diff --git a/frontend/src/types/run.d.ts b/frontend/src/types/run.d.ts index 81146b2f66..452d3a9f41 100644 --- a/frontend/src/types/run.d.ts +++ b/frontend/src/types/run.d.ts @@ -14,7 +14,7 @@ declare type TGPUResources = IGPUSpecRequest & { name?: string | string[]; }; -declare type TIde = 'cursor' | 'vscode'; +declare type TIde = 'cursor' | 'vscode' | 'windsurf'; declare type TVolumeMountPointRequest = { name: string | string[]; diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index d025160d0c..3d126dd34f 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -1,5 +1,8 @@ import argparse +import json +import os import shlex +import shutil import subprocess import sys import time @@ -677,6 +680,14 @@ def apply_args(self, conf: DevEnvironmentConfiguration, args: argparse.Namespace "Fix by opening [code]Command Palette[/code], executing [code]Shell Command: " "Install 'cursor' command in PATH[/code], and restarting terminal.[/]\n" ) + if conf.ide == "windsurf" and conf.version is None: + conf.version = _detect_windsurf_version() + if conf.version is None: + console.print( + "[secondary]Unable to detect the Windsurf version and pre-install extensions. " + "Fix by opening [code]Command Palette[/code], executing [code]Shell Command: " + "Install 'surf' command in PATH[/code], and restarting terminal.[/]\n" + ) class ServiceConfigurator(RunWithCommandsConfiguratorMixin, BaseRunConfigurator): @@ -730,6 +741,53 @@ def _detect_cursor_version(exe: str = "cursor") -> Optional[str]: return None +def _detect_windsurf_version(exe: str = "windsurf") -> Optional[str]: + """ + Detects the installed Windsurf product version and commit hash. 
+ Returns string in format 'version@commit' (e.g., '1.13.5@97d7a...') or None. + """ + # 1. Locate executable in PATH + cmd_path = shutil.which(exe) + if not cmd_path: + return None + + try: + # 2. Resolve symlinks to find the actual installation directory + current_dir = os.path.dirname(os.path.realpath(cmd_path)) + + # 3. Walk up directory tree to find 'resources/app/product.json' + # Covers Linux (/opt/...), macOS (Contents/Resources/...), and Windows + for _ in range(6): + # Check standard lowercase and macOS TitleCase + for resource_folder in ["resources", "Resources"]: + json_path = os.path.join(current_dir, resource_folder, "app", "product.json") + + if os.path.exists(json_path): + try: + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + # Key 'windsurfVersion' is the product version (1.13.5) + # Key 'version' is the base VS Code version (1.9x) + ver = data.get("windsurfVersion") + commit = data.get("commit") + + if ver and commit: + return f"{ver}@{commit}" + except (OSError, json.JSONDecodeError): + continue + + # Move up one directory level + parent = os.path.dirname(current_dir) + if parent == current_dir: # Reached filesystem root + break + current_dir = parent + + except Exception: + return None + + return None + + def _print_service_urls(run: Run) -> None: if run._run.run_spec.configuration.type != RunConfigurationType.SERVICE.value: return diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 9c44155564..4558aebb11 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -619,10 +619,17 @@ def check_image_or_commands_present(cls, values): class DevEnvironmentConfigurationParams(CoreModel): ide: Annotated[ - Union[Literal["vscode"], Literal["cursor"]], - Field(description="The IDE to run. 
Supported values include `vscode` and `cursor`"), + Union[Literal["vscode"], Literal["cursor"], Literal["windsurf"]], + Field( + description="The IDE to run. Supported values include `vscode`, `cursor`, and `windsurf`" + ), ] - version: Annotated[Optional[str], Field(description="The version of the IDE")] = None + version: Annotated[ + Optional[str], + Field( + description="The version of the IDE. For `windsurf`, the version is in the format `version@commit`" + ), + ] = None init: Annotated[CommandsList, Field(description="The shell commands to run on startup")] = [] inactivity_duration: Annotated[ Optional[Union[Literal["off"], int, bool, str]], @@ -649,6 +656,19 @@ def parse_inactivity_duration( return v return None + @root_validator + def validate_windsurf_version_format(cls, values): + ide = values.get("ide") + version = values.get("version") + if ide == "windsurf" and version: + # Validate format: version@commit + if not re.match(r"^.+@[a-f0-9]+$", version): + raise ValueError( + f"Invalid Windsurf version format: `{version}`. " + "Expected format: `version@commit` (e.g., `1.106.0@8951cd3ad688e789573d7f51750d67ae4a0bea7d`)" + ) + return values + class DevEnvironmentConfigurationConfig( ProfileParamsConfig, diff --git a/src/dstack/_internal/server/services/jobs/configurators/dev.py b/src/dstack/_internal/server/services/jobs/configurators/dev.py index 3efea3fa2a..da683a60cc 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/dev.py +++ b/src/dstack/_internal/server/services/jobs/configurators/dev.py @@ -7,6 +7,7 @@ from dstack._internal.server.services.jobs.configurators.base import JobConfigurator from dstack._internal.server.services.jobs.configurators.extensions.cursor import CursorDesktop from dstack._internal.server.services.jobs.configurators.extensions.vscode import VSCodeDesktop +from dstack._internal.server.services.jobs.configurators.extensions.windsurf import WindsurfDesktop INSTALL_IPYKERNEL = ( "(echo 'pip install ipykernel...' 
&& pip install -q --no-cache-dir ipykernel 2> /dev/null) || " @@ -24,6 +25,8 @@ def __init__(self, run_spec: RunSpec, secrets: Dict[str, str]): __class = VSCodeDesktop elif run_spec.configuration.ide == "cursor": __class = CursorDesktop + elif run_spec.configuration.ide == "windsurf": + __class = WindsurfDesktop else: raise ServerClientError(f"Unsupported IDE: {run_spec.configuration.ide}") self.ide = __class( diff --git a/src/dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py b/src/dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py new file mode 100644 index 0000000000..63fee839f0 --- /dev/null +++ b/src/dstack/_internal/server/services/jobs/configurators/extensions/windsurf.py @@ -0,0 +1,43 @@ +from typing import List, Optional + + +class WindsurfDesktop: + def __init__( + self, + run_name: Optional[str], + version: Optional[str], + extensions: List[str], + ): + self.run_name = run_name + self.version = version + self.extensions = extensions + + def get_install_commands(self) -> List[str]: + commands = [] + if self.version is not None: + version, commit = self.version.split("@") + url = f"https://windsurf-stable.codeiumdata.com/linux-reh-$arch/stable/{commit}/windsurf-reh-linux-$arch-{version}.tar.gz" + archive = "windsurf-reh-linux-$arch.tar.gz" + target = f'~/.windsurf-server/bin/"{commit}"' + commands.extend( + [ + 'if [ $(uname -m) = "aarch64" ]; then arch="arm64"; else arch="x64"; fi', + "mkdir -p /tmp", + f'wget -q --show-progress "{url}" -O "/tmp/{archive}"', + f"mkdir -vp {target}", + f'tar --no-same-owner -xz --strip-components=1 -C {target} -f "/tmp/{archive}"', + f'rm "/tmp/{archive}"', + ] + ) + if self.extensions: + extensions = " ".join(f'--install-extension "{name}"' for name in self.extensions) + commands.append(f'PATH="$PATH":{target}/bin windsurf-server {extensions}') + return commands + + def get_print_readme_commands(self) -> List[str]: + return [ + "echo To open in Windsurf, use link below:", + 
"echo", + f'echo " windsurf://vscode-remote/ssh-remote+{self.run_name}$DSTACK_WORKING_DIR"', + "echo", + ] diff --git a/src/tests/_internal/core/models/test_configurations.py b/src/tests/_internal/core/models/test_configurations.py index 79007fe195..65eec62642 100644 --- a/src/tests/_internal/core/models/test_configurations.py +++ b/src/tests/_internal/core/models/test_configurations.py @@ -4,7 +4,11 @@ from dstack._internal.core.errors import ConfigurationError from dstack._internal.core.models.common import RegistryAuth -from dstack._internal.core.models.configurations import RepoSpec, parse_run_configuration +from dstack._internal.core.models.configurations import ( + DevEnvironmentConfigurationParams, + RepoSpec, + parse_run_configuration, +) from dstack._internal.core.models.resources import Range @@ -139,3 +143,49 @@ def test_registry_auth_hashable(): """ registry_auth = RegistryAuth(username="username", password="password") hash(registry_auth) + + +class TestDevEnvironmentConfigurationParams: + def test_windsurf_version_valid_format(self): + params = DevEnvironmentConfigurationParams( + ide="windsurf", version="1.106.0@8951cd3ad688e789573d7f51750d67ae4a0bea7d" + ) + assert params.ide == "windsurf" + assert params.version == "1.106.0@8951cd3ad688e789573d7f51750d67ae4a0bea7d" + + def test_windsurf_version_valid_short_commit(self): + params = DevEnvironmentConfigurationParams(ide="windsurf", version="1.0.0@abc123") + assert params.version == "1.0.0@abc123" + + def test_windsurf_version_empty_allowed(self): + params = DevEnvironmentConfigurationParams(ide="windsurf", version=None) + assert params.ide == "windsurf" + assert params.version is None + + def test_windsurf_version_invalid_missing_at(self): + with pytest.raises(ValueError, match="Invalid Windsurf version format"): + DevEnvironmentConfigurationParams(ide="windsurf", version="1.106.0") + + def test_windsurf_version_invalid_missing_commit(self): + with pytest.raises(ValueError, match="Invalid Windsurf 
version format"): + DevEnvironmentConfigurationParams(ide="windsurf", version="1.106.0@") + + def test_windsurf_version_invalid_missing_version(self): + with pytest.raises(ValueError, match="Invalid Windsurf version format"): + DevEnvironmentConfigurationParams( + ide="windsurf", version="@8951cd3ad688e789573d7f51750d67ae4a0bea7d" + ) + + def test_windsurf_version_invalid_non_hex_commit(self): + with pytest.raises(ValueError, match="Invalid Windsurf version format"): + DevEnvironmentConfigurationParams(ide="windsurf", version="1.106.0@ghijklmnop") + + def test_vscode_version_not_validated(self): + params = DevEnvironmentConfigurationParams(ide="vscode", version="1.80.0") + assert params.ide == "vscode" + assert params.version == "1.80.0" + + def test_cursor_version_not_validated(self): + params = DevEnvironmentConfigurationParams(ide="cursor", version="0.40.0") + assert params.ide == "cursor" + assert params.version == "0.40.0" From dd907bf127c87fd2dde4ebeff6638388f3f48468 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 8 Jan 2026 07:38:18 +0000 Subject: [PATCH 040/187] Add `processing instance` debug log message (#3450) --- .../_internal/server/background/tasks/process_instances.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 9d75c58756..2241c4c6a4 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -206,6 +206,7 @@ async def _process_next_instance(): async def _process_instance(session: AsyncSession, instance: InstanceModel): + logger.debug("%s: processing instance, status: %s", fmt(instance), instance.status.upper()) # Refetch to load related attributes. # Load related attributes only for statuses that always need them. 
if instance.status in ( From 2a4c0e1751e62595dce3c1a646b7b8be157b2d47 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 8 Jan 2026 08:55:50 +0000 Subject: [PATCH 041/187] [runner] Decouple Server and Executor (#3447) * Pass Executor to Server as an argument * Move repo blob-related code from the API handler to a new Executor method * Fix http.MaxBytesError handling --- runner/cmd/runner/main.go | 8 ++++- runner/internal/executor/base.go | 4 +-- runner/internal/executor/executor.go | 37 +++++++++---------- runner/internal/executor/executor_test.go | 26 +++++++------- runner/internal/executor/files.go | 13 ++++--- runner/internal/executor/repo.go | 29 +++++++++++++-- runner/internal/runner/api/http.go | 44 +++++++++++++---------- runner/internal/runner/api/server.go | 14 ++------ 8 files changed, 99 insertions(+), 76 deletions(-) diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index a080246d41..27e529417a 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -15,6 +15,7 @@ import ( "github.com/urfave/cli/v3" "github.com/dstackai/dstack/runner/consts" + "github.com/dstackai/dstack/runner/internal/executor" "github.com/dstackai/dstack/runner/internal/log" "github.com/dstackai/dstack/runner/internal/runner/api" "github.com/dstackai/dstack/runner/internal/ssh" @@ -162,7 +163,12 @@ func start(ctx context.Context, tempDir string, homeDir string, httpPort int, ss } }() - server, err := api.NewServer(ctx, tempDir, homeDir, dstackDir, sshd, fmt.Sprintf(":%d", httpPort), version) + ex, err := executor.NewRunExecutor(tempDir, homeDir, dstackDir, sshd) + if err != nil { + return fmt.Errorf("create executor: %w", err) + } + + server, err := api.NewServer(ctx, fmt.Sprintf(":%d", httpPort), version, ex) if err != nil { return fmt.Errorf("create server: %w", err) } diff --git a/runner/internal/executor/base.go b/runner/internal/executor/base.go index 554bd7646a..4961180e99 100644 --- a/runner/internal/executor/base.go +++ 
b/runner/internal/executor/base.go @@ -13,7 +13,6 @@ type Executor interface { GetJobWsLogsHistory() []schemas.LogEvent GetRunnerState() string Run(ctx context.Context) error - SetCodePath(codePath string) SetJob(job schemas.SubmitBody) SetJobState(ctx context.Context, state types.JobState) SetJobStateWithTerminationReason( @@ -23,7 +22,8 @@ type Executor interface { termination_message string, ) SetRunnerState(state string) - AddFileArchive(id string, src io.Reader) error + WriteFileArchive(id string, src io.Reader) error + WriteRepoBlob(src io.Reader) error Lock() RLock() RUnlock() diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go index 56a5d1cd9f..fc4039cf96 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/executor/executor.go @@ -52,11 +52,12 @@ type ConnectionTracker interface { } type RunExecutor struct { - tempDir string - homeDir string - dstackDir string - archiveDir string - sshd ssh.SshdManager + tempDir string + homeDir string + dstackDir string + fileArchiveDir string + repoBlobDir string + sshd ssh.SshdManager currentUid uint32 @@ -67,7 +68,7 @@ type RunExecutor struct { secrets map[string]string repoCredentials *schemas.RepoCredentials repoDir string - codePath string + repoBlobPath string jobUid int jobGid int jobHomeDir string @@ -123,14 +124,15 @@ func NewRunExecutor(tempDir string, homeDir string, dstackDir string, sshd ssh.S } return &RunExecutor{ - tempDir: tempDir, - homeDir: homeDir, - dstackDir: dstackDir, - archiveDir: filepath.Join(tempDir, "file_archives"), - sshd: sshd, - currentUid: uid, - jobUid: -1, - jobGid: -1, + tempDir: tempDir, + homeDir: homeDir, + dstackDir: dstackDir, + fileArchiveDir: filepath.Join(tempDir, "file_archives"), + repoBlobDir: filepath.Join(tempDir, "repo_blobs"), + sshd: sshd, + currentUid: uid, + jobUid: -1, + jobGid: -1, mu: mu, state: WaitSubmit, @@ -145,7 +147,7 @@ func NewRunExecutor(tempDir string, homeDir string, dstackDir string, sshd ssh.S }, 
nil } -// Run must be called after SetJob and SetCodePath +// Run must be called after SetJob and WriteRepoBlob func (ex *RunExecutor) Run(ctx context.Context) (err error) { runnerLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerLogFileName)) if err != nil { @@ -296,11 +298,6 @@ func (ex *RunExecutor) SetJob(body schemas.SubmitBody) { ex.state = WaitCode } -func (ex *RunExecutor) SetCodePath(codePath string) { - ex.codePath = codePath - ex.state = WaitRun -} - func (ex *RunExecutor) SetJobState(ctx context.Context, state types.JobState) { ex.SetJobStateWithTerminationReason(ctx, state, "", "") } diff --git a/runner/internal/executor/executor_test.go b/runner/internal/executor/executor_test.go index e3661fac0e..0d935dd642 100644 --- a/runner/internal/executor/executor_test.go +++ b/runner/internal/executor/executor_test.go @@ -69,7 +69,7 @@ func TestExecutor_HomeDir(t *testing.T) { func TestExecutor_NonZeroExit(t *testing.T) { ex := makeTestExecutor(t) ex.jobSpec.Commands = append(ex.jobSpec.Commands, "exit 100") - makeCodeTar(t, ex.codePath) + makeCodeTar(t, ex) err := ex.Run(t.Context()) assert.Error(t, err) @@ -104,7 +104,7 @@ func TestExecutor_LocalRepo(t *testing.T) { ex := makeTestExecutor(t) cmd := fmt.Sprintf("cat %s/foo", *ex.jobSpec.RepoDir) ex.jobSpec.Commands = append(ex.jobSpec.Commands, cmd) - makeCodeTar(t, ex.codePath) + makeCodeTar(t, ex) err := ex.setupRepo(t.Context()) require.NoError(t, err) @@ -117,7 +117,7 @@ func TestExecutor_LocalRepo(t *testing.T) { func TestExecutor_Recover(t *testing.T) { ex := makeTestExecutor(t) ex.jobSpec.Commands = nil // cause a panic - makeCodeTar(t, ex.codePath) + makeCodeTar(t, ex) err := ex.Run(t.Context()) assert.ErrorContains(t, err, "recovered: ") @@ -134,7 +134,7 @@ func TestExecutor_MaxDuration(t *testing.T) { ex.killDelay = 500 * time.Millisecond ex.jobSpec.Commands = append(ex.jobSpec.Commands, "echo 1 && sleep 2 && echo 2") ex.jobSpec.MaxDuration = 1 // seconds - makeCodeTar(t, 
ex.codePath) + makeCodeTar(t, ex) err := ex.Run(t.Context()) assert.ErrorContains(t, err, "killed") @@ -155,7 +155,7 @@ func TestExecutor_RemoteRepo(t *testing.T) { RepoConfigEmail: "developer@dstack.ai", } ex.jobSpec.Commands = append(ex.jobSpec.Commands, "git rev-parse HEAD && git config user.name && git config user.email") - err := os.WriteFile(ex.codePath, []byte{}, 0o600) // empty diff + err := ex.WriteRepoBlob(bytes.NewReader([]byte{})) // empty diff require.NoError(t, err) err = ex.setJobWorkingDir(t.Context()) @@ -210,19 +210,17 @@ func makeTestExecutor(t *testing.T) *RunExecutor { require.NoError(t, os.Mkdir(homeDir, 0o700)) dstackDir := filepath.Join(baseDir, "dstack") require.NoError(t, os.Mkdir(dstackDir, 0o755)) - ex, _ := NewRunExecutor(tempDir, homeDir, dstackDir, new(sshdMock)) + ex, err := NewRunExecutor(tempDir, homeDir, dstackDir, new(sshdMock)) + require.NoError(t, err) ex.SetJob(body) - ex.SetCodePath(filepath.Join(baseDir, "code")) // note: create file before run - ex.setJobWorkingDir(context.Background()) + require.NoError(t, ex.setJobWorkingDir(t.Context())) return ex } -func makeCodeTar(t *testing.T, path string) { +func makeCodeTar(t *testing.T, ex *RunExecutor) { t.Helper() - file, err := os.Create(path) - require.NoError(t, err) - defer func() { _ = file.Close() }() - tw := tar.NewWriter(file) + var b bytes.Buffer + tw := tar.NewWriter(&b) files := []struct{ name, body string }{ {"foo", "bar\n"}, @@ -235,6 +233,8 @@ func makeCodeTar(t *testing.T, path string) { require.NoError(t, err) } require.NoError(t, tw.Close()) + + require.NoError(t, ex.WriteRepoBlob(&b)) } func TestWriteDstackProfile(t *testing.T) { diff --git a/runner/internal/executor/files.go b/runner/internal/executor/files.go index c447006c35..ee1170c418 100644 --- a/runner/internal/executor/files.go +++ b/runner/internal/executor/files.go @@ -18,11 +18,11 @@ import ( var renameRegex = regexp.MustCompile(`^([^/]*)(/|$)`) -func (ex *RunExecutor) AddFileArchive(id string, src 
io.Reader) error { - if err := os.MkdirAll(ex.archiveDir, 0o755); err != nil { +func (ex *RunExecutor) WriteFileArchive(id string, src io.Reader) error { + if err := os.MkdirAll(ex.fileArchiveDir, 0o755); err != nil { return fmt.Errorf("create archive directory: %w", err) } - archivePath := path.Join(ex.archiveDir, id) + archivePath := path.Join(ex.fileArchiveDir, id) archive, err := os.Create(archivePath) if err != nil { return fmt.Errorf("create archive file: %w", err) @@ -45,13 +45,13 @@ func (ex *RunExecutor) setupFiles(ctx context.Context) error { return fmt.Errorf("setup files: working dir must be absolute: %s", ex.jobWorkingDir) } for _, fa := range ex.jobSpec.FileArchives { - archivePath := path.Join(ex.archiveDir, fa.Id) + archivePath := path.Join(ex.fileArchiveDir, fa.Id) if err := extractFileArchive(ctx, archivePath, fa.Path, ex.jobWorkingDir, ex.jobUid, ex.jobGid, ex.jobHomeDir); err != nil { return fmt.Errorf("extract file archive %s: %w", fa.Id, err) } } - if err := os.RemoveAll(ex.archiveDir); err != nil { - log.Warning(ctx, "Failed to remove file archives dir", "path", ex.archiveDir, "err", err) + if err := os.RemoveAll(ex.fileArchiveDir); err != nil { + log.Warning(ctx, "Failed to remove file archives dir", "path", ex.fileArchiveDir, "err", err) } return nil } @@ -90,7 +90,6 @@ func extractFileArchive(ctx context.Context, archivePath string, destPath string if uid != -1 || gid != -1 { for _, p := range paths { - log.Warning(ctx, "path", "path", p) if err := os.Chown(path.Join(destBase, p), uid, gid); err != nil { log.Warning(ctx, "Failed to chown", "path", p, "err", err) } diff --git a/runner/internal/executor/repo.go b/runner/internal/executor/repo.go index 32f623e70c..2f757f63c6 100644 --- a/runner/internal/executor/repo.go +++ b/runner/internal/executor/repo.go @@ -4,9 +4,11 @@ import ( "context" "errors" "fmt" + "io" "io/fs" "os" "os/exec" + "path" "path/filepath" "github.com/codeclysm/extract/v4" @@ -17,6 +19,23 @@ import ( 
"github.com/dstackai/dstack/runner/internal/schemas" ) +// WriteRepoBlob must be called after SetJob +func (ex *RunExecutor) WriteRepoBlob(src io.Reader) error { + if err := os.MkdirAll(ex.repoBlobDir, 0o755); err != nil { + return fmt.Errorf("create blob directory: %w", err) + } + ex.repoBlobPath = path.Join(ex.repoBlobDir, ex.run.RunSpec.RepoId) + blob, err := os.Create(ex.repoBlobPath) + if err != nil { + return fmt.Errorf("create blob file: %w", err) + } + defer func() { _ = blob.Close() }() + if _, err = io.Copy(blob, src); err != nil { + return fmt.Errorf("copy blob data: %w", err) + } + return nil +} + // setupRepo must be called from Run // Must be called after setJobWorkingDir and setJobCredentials func (ex *RunExecutor) setupRepo(ctx context.Context) error { @@ -100,6 +119,10 @@ func (ex *RunExecutor) setupRepo(ctx context.Context) error { return fmt.Errorf("chown repo dir: %w", err) } + if err := os.RemoveAll(ex.repoBlobDir); err != nil { + log.Warning(ctx, "Failed to remove repo blobs dir", "path", ex.repoBlobDir, "err", err) + } + return err } @@ -143,7 +166,7 @@ func (ex *RunExecutor) prepareGit(ctx context.Context) error { } log.Trace(ctx, "Applying diff") - repoDiff, err := os.ReadFile(ex.codePath) + repoDiff, err := os.ReadFile(ex.repoBlobPath) if err != nil { return fmt.Errorf("read repo diff: %w", err) } @@ -156,12 +179,12 @@ func (ex *RunExecutor) prepareGit(ctx context.Context) error { } func (ex *RunExecutor) prepareArchive(ctx context.Context) error { - file, err := os.Open(ex.codePath) + file, err := os.Open(ex.repoBlobPath) if err != nil { return fmt.Errorf("open code archive: %w", err) } defer func() { _ = file.Close() }() - log.Trace(ctx, "Extracting code archive", "src", ex.codePath, "dst", ex.repoDir) + log.Trace(ctx, "Extracting code archive", "src", ex.repoBlobPath, "dst", ex.repoDir) if err := extract.Tar(ctx, file, ex.repoDir, nil); err != nil { return fmt.Errorf("extract tar archive: %w", err) } diff --git 
a/runner/internal/runner/api/http.go b/runner/internal/runner/api/http.go index bbf416efbe..87eb96e0af 100644 --- a/runner/internal/runner/api/http.go +++ b/runner/internal/runner/api/http.go @@ -9,8 +9,6 @@ import ( "mime" "mime/multipart" "net/http" - "os" - "path/filepath" "strconv" "github.com/dstackai/dstack/runner/internal/api" @@ -19,6 +17,9 @@ import ( "github.com/dstackai/dstack/runner/internal/schemas" ) +// TODO: set some reasonable value; (optional) make configurable +const maxBodySize = math.MaxInt64 + func (s *Server) healthcheckGetHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { return &schemas.HealthcheckResponse{ Service: "dstack-runner", @@ -84,13 +85,16 @@ func (s *Server) uploadArchivePostHandler(w http.ResponseWriter, r *http.Request return nil, &api.Error{Status: http.StatusBadRequest, Msg: "missing boundary"} } - r.Body = http.MaxBytesReader(w, r.Body, math.MaxInt64) + r.Body = http.MaxBytesReader(w, r.Body, maxBodySize) formReader := multipart.NewReader(r.Body, boundary) part, err := formReader.NextPart() - if errors.Is(err, io.EOF) { - return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty form"} - } if err != nil { + if errors.Is(err, io.EOF) { + return nil, &api.Error{Status: http.StatusBadRequest, Msg: "empty form"} + } + if isMaxBytesError(err) { + return nil, &api.Error{Status: http.StatusRequestEntityTooLarge} + } return nil, fmt.Errorf("read multipart form: %w", err) } defer func() { _ = part.Close() }() @@ -106,8 +110,11 @@ func (s *Server) uploadArchivePostHandler(w http.ResponseWriter, r *http.Request if archiveId == "" { return nil, &api.Error{Status: http.StatusBadRequest, Msg: "missing file name"} } - if err := s.executor.AddFileArchive(archiveId, part); err != nil { - return nil, fmt.Errorf("add file archive: %w", err) + if err := s.executor.WriteFileArchive(archiveId, part); err != nil { + if isMaxBytesError(err) { + return nil, &api.Error{Status: http.StatusRequestEntityTooLarge} + } + return 
nil, fmt.Errorf("write file archive: %w", err) } if _, err := formReader.NextPart(); !errors.Is(err, io.EOF) { return nil, &api.Error{Status: http.StatusBadRequest, Msg: "extra form field(s)"} @@ -123,21 +130,17 @@ func (s *Server) uploadCodePostHandler(w http.ResponseWriter, r *http.Request) ( return nil, &api.Error{Status: http.StatusConflict} } - r.Body = http.MaxBytesReader(w, r.Body, math.MaxInt64) - codePath := filepath.Join(s.tempDir, "code") // todo random name? - file, err := os.Create(codePath) - if err != nil { - return nil, fmt.Errorf("create code file: %w", err) - } - defer func() { _ = file.Close() }() - if _, err = io.Copy(file, r.Body); err != nil { - if err.Error() == "http: request body too large" { + r.Body = http.MaxBytesReader(w, r.Body, maxBodySize) + + if err := s.executor.WriteRepoBlob(r.Body); err != nil { + if isMaxBytesError(err) { return nil, &api.Error{Status: http.StatusRequestEntityTooLarge} } return nil, fmt.Errorf("copy request body: %w", err) } - s.executor.SetCodePath(codePath) + s.executor.SetRunnerState(executor.WaitRun) + return nil, nil } @@ -181,3 +184,8 @@ func (s *Server) stopPostHandler(w http.ResponseWriter, r *http.Request) (interf s.stop() return nil, nil } + +func isMaxBytesError(err error) bool { + var maxBytesError *http.MaxBytesError + return errors.As(err, &maxBytesError) +} diff --git a/runner/internal/runner/api/server.go b/runner/internal/runner/api/server.go index 0a0b851a9f..ba577d1a5b 100644 --- a/runner/internal/runner/api/server.go +++ b/runner/internal/runner/api/server.go @@ -11,12 +11,10 @@ import ( "github.com/dstackai/dstack/runner/internal/executor" "github.com/dstackai/dstack/runner/internal/log" "github.com/dstackai/dstack/runner/internal/metrics" - "github.com/dstackai/dstack/runner/internal/ssh" ) type Server struct { - srv *http.Server - tempDir string + srv *http.Server shutdownCh chan interface{} // server closes this chan on shutdown jobBarrierCh chan interface{} // only server listens on this 
chan @@ -34,15 +32,8 @@ type Server struct { version string } -func NewServer( - ctx context.Context, tempDir string, homeDir string, dstackDir string, sshd ssh.SshdManager, - address string, version string, -) (*Server, error) { +func NewServer(ctx context.Context, address string, version string, ex executor.Executor) (*Server, error) { r := api.NewRouter() - ex, err := executor.NewRunExecutor(tempDir, homeDir, dstackDir, sshd) - if err != nil { - return nil, err - } metricsCollector, err := metrics.NewMetricsCollector(ctx) if err != nil { @@ -54,7 +45,6 @@ func NewServer( Addr: address, Handler: r, }, - tempDir: tempDir, shutdownCh: make(chan interface{}), jobBarrierCh: make(chan interface{}), From d48b15fb5450fb5a4084e6c1d79233113dba83d0 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Thu, 8 Jan 2026 10:01:56 +0100 Subject: [PATCH 042/187] [Feature] Allow to see JSON state of runs/volumes/fleets/gateways via CLI/UI (#3445) * [Feature] Added `dstack inspect` CLI command * [Feature] Add `Inspect` tab to the run and fleet pages * PR review feedback. Replaced `dstack inspect run|fleet|volume|gateway` with seprate commands `datack run|fleet|volume|gateway list --json`. 
* PR review: `dstack run|fleet get --json` Updated how UUID format errors are handled * PR review: `dstack run|fleet get --json` Better handling edge case (empty ID) --- frontend/src/locale/en.json | 2 + .../pages/Fleets/Details/Inspect/index.tsx | 113 ++++++++++++++++++ frontend/src/pages/Fleets/Details/index.tsx | 6 + .../src/pages/Runs/Details/Inspect/index.tsx | 108 +++++++++++++++++ frontend/src/pages/Runs/Details/constants.ts | 1 + frontend/src/pages/Runs/Details/index.tsx | 5 + frontend/src/router.tsx | 10 ++ frontend/src/routes.ts | 10 ++ src/dstack/_internal/cli/commands/fleet.py | 49 +++++++- src/dstack/_internal/cli/commands/gateway.py | 29 ++++- src/dstack/_internal/cli/commands/run.py | 69 +++++++++++ src/dstack/_internal/cli/commands/volume.py | 27 +++++ src/dstack/_internal/cli/main.py | 2 + src/dstack/api/server/_fleets.py | 13 +- src/dstack/api/server/_runs.py | 10 +- 15 files changed, 447 insertions(+), 7 deletions(-) create mode 100644 frontend/src/pages/Fleets/Details/Inspect/index.tsx create mode 100644 frontend/src/pages/Runs/Details/Inspect/index.tsx create mode 100644 src/dstack/_internal/cli/commands/run.py diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index f026151083..da3fe00fa9 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -396,6 +396,7 @@ "log": "Logs", "log_empty_message_title": "No logs", "log_empty_message_text": "No logs to display.", + "inspect": "Inspect", "run_name": "Name", "workflow_name": "Workflow", "configuration": "Configuration", @@ -573,6 +574,7 @@ "fleet_placeholder": "Filtering by fleet", "fleet_name": "Fleet name", "total_instances": "Number of instances", + "inspect": "Inspect", "empty_message_title": "No fleets", "empty_message_text": "No fleets to display.", "nomatch_message_title": "No matches", diff --git a/frontend/src/pages/Fleets/Details/Inspect/index.tsx b/frontend/src/pages/Fleets/Details/Inspect/index.tsx new file mode 100644 index 
0000000000..844ebe849d --- /dev/null +++ b/frontend/src/pages/Fleets/Details/Inspect/index.tsx @@ -0,0 +1,113 @@ +import React, { useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import ace from 'ace-builds'; +import CodeEditor, { CodeEditorProps } from '@cloudscape-design/components/code-editor'; +import { Mode } from '@cloudscape-design/global-styles'; + +import { Container, Header, Loader } from 'components'; +import { CODE_EDITOR_I18N_STRINGS } from 'components/form/CodeEditor/constants'; + +import { useAppSelector } from 'hooks'; +import { useGetFleetDetailsQuery } from 'services/fleet'; + +import { selectSystemMode } from 'App/slice'; + +import 'ace-builds/src-noconflict/theme-cloud_editor'; +import 'ace-builds/src-noconflict/theme-cloud_editor_dark'; +import 'ace-builds/src-noconflict/mode-json'; +import 'ace-builds/src-noconflict/ext-language_tools'; + +ace.config.set('useWorker', false); + +interface AceEditorElement extends HTMLElement { + env?: { + editor?: { + setReadOnly: (readOnly: boolean) => void; + }; + }; +} + +export const FleetInspect = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramFleetId = params.fleetId ?? ''; + + const systemMode = useAppSelector(selectSystemMode) ?? ''; + + const { data: fleetData, isLoading } = useGetFleetDetailsQuery( + { + projectName: paramProjectName, + fleetId: paramFleetId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + const [codeEditorPreferences, setCodeEditorPreferences] = useState(() => ({ + theme: systemMode === Mode.Dark ? 
'cloud_editor_dark' : 'cloud_editor', + })); + + useEffect(() => { + if (systemMode === Mode.Dark) + setCodeEditorPreferences({ + theme: 'cloud_editor_dark', + }); + else + setCodeEditorPreferences({ + theme: 'cloud_editor', + }); + }, [systemMode]); + + const onCodeEditorPreferencesChange: CodeEditorProps['onPreferencesChange'] = (e) => { + setCodeEditorPreferences(e.detail); + }; + + const jsonContent = useMemo(() => { + if (!fleetData) return ''; + return JSON.stringify(fleetData, null, 2); + }, [fleetData]); + + // Set editor to read-only after it loads + useEffect(() => { + const timer = setTimeout(() => { + // Find the ace editor instance in the DOM + const editorElements = document.querySelectorAll('.ace_editor'); + editorElements.forEach((element: Element) => { + const aceEditor = (element as AceEditorElement).env?.editor; + if (aceEditor) { + aceEditor.setReadOnly(true); + } + }); + }, 100); + + return () => clearTimeout(timer); + }, [jsonContent]); + + if (isLoading) + return ( + + + + ); + + return ( + {t('fleets.inspect')}
}> + { + // Prevent editing - onChange is required but we ignore changes + }} + /> + + ); +}; diff --git a/frontend/src/pages/Fleets/Details/index.tsx b/frontend/src/pages/Fleets/Details/index.tsx index d3690fcff2..6e5d9e6d7a 100644 --- a/frontend/src/pages/Fleets/Details/index.tsx +++ b/frontend/src/pages/Fleets/Details/index.tsx @@ -7,6 +7,7 @@ import { Button, ContentLayout, DetailsHeader, Tabs } from 'components'; enum CodeTab { Details = 'details', Events = 'events', + Inspect = 'inspect', } import { useBreadcrumbs } from 'hooks'; @@ -96,6 +97,11 @@ export const FleetDetails: React.FC = () => { id: CodeTab.Events, href: ROUTES.FLEETS.DETAILS.EVENTS.FORMAT(paramProjectName, paramFleetId), }, + { + label: 'Inspect', + id: CodeTab.Inspect, + href: ROUTES.FLEETS.DETAILS.INSPECT.FORMAT(paramProjectName, paramFleetId), + }, ]} /> diff --git a/frontend/src/pages/Runs/Details/Inspect/index.tsx b/frontend/src/pages/Runs/Details/Inspect/index.tsx new file mode 100644 index 0000000000..f37aa90ad3 --- /dev/null +++ b/frontend/src/pages/Runs/Details/Inspect/index.tsx @@ -0,0 +1,108 @@ +import React, { useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import ace from 'ace-builds'; +import CodeEditor, { CodeEditorProps } from '@cloudscape-design/components/code-editor'; +import { Mode } from '@cloudscape-design/global-styles'; + +import { Container, Header, Loader } from 'components'; +import { CODE_EDITOR_I18N_STRINGS } from 'components/form/CodeEditor/constants'; + +import { useAppSelector } from 'hooks'; +import { useGetRunQuery } from 'services/run'; + +import { selectSystemMode } from 'App/slice'; + +import 'ace-builds/src-noconflict/theme-cloud_editor'; +import 'ace-builds/src-noconflict/theme-cloud_editor_dark'; +import 'ace-builds/src-noconflict/mode-json'; +import 'ace-builds/src-noconflict/ext-language_tools'; + +ace.config.set('useWorker', false); + +interface 
AceEditorElement extends HTMLElement { + env?: { + editor?: { + setReadOnly: (readOnly: boolean) => void; + }; + }; +} + +export const RunInspect = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramRunId = params.runId ?? ''; + + const systemMode = useAppSelector(selectSystemMode) ?? ''; + + const { data: runData, isLoading } = useGetRunQuery({ + project_name: paramProjectName, + id: paramRunId, + }); + + const [codeEditorPreferences, setCodeEditorPreferences] = useState(() => ({ + theme: systemMode === Mode.Dark ? 'cloud_editor_dark' : 'cloud_editor', + })); + + useEffect(() => { + if (systemMode === Mode.Dark) + setCodeEditorPreferences({ + theme: 'cloud_editor_dark', + }); + else + setCodeEditorPreferences({ + theme: 'cloud_editor', + }); + }, [systemMode]); + + const onCodeEditorPreferencesChange: CodeEditorProps['onPreferencesChange'] = (e) => { + setCodeEditorPreferences(e.detail); + }; + + const jsonContent = useMemo(() => { + if (!runData) return ''; + return JSON.stringify(runData, null, 2); + }, [runData]); + + // Set editor to read-only after it loads + useEffect(() => { + const timer = setTimeout(() => { + // Find the ace editor instance in the DOM + const editorElements = document.querySelectorAll('.ace_editor'); + editorElements.forEach((element: Element) => { + const aceEditor = (element as AceEditorElement).env?.editor; + if (aceEditor) { + aceEditor.setReadOnly(true); + } + }); + }, 100); + + return () => clearTimeout(timer); + }, [jsonContent]); + + if (isLoading) + return ( + + + + ); + + return ( + {t('projects.run.inspect')}
}> + { + // Prevent editing - onChange is required but we ignore changes + }} + /> + + ); +}; diff --git a/frontend/src/pages/Runs/Details/constants.ts b/frontend/src/pages/Runs/Details/constants.ts index 1bf4bc69c0..7a63d3f95c 100644 --- a/frontend/src/pages/Runs/Details/constants.ts +++ b/frontend/src/pages/Runs/Details/constants.ts @@ -3,4 +3,5 @@ export enum CodeTab { Metrics = 'metrics', Logs = 'logs', Events = 'events', + Inspect = 'inspect', } diff --git a/frontend/src/pages/Runs/Details/index.tsx b/frontend/src/pages/Runs/Details/index.tsx index 78e9850c8e..5195b4fdc0 100644 --- a/frontend/src/pages/Runs/Details/index.tsx +++ b/frontend/src/pages/Runs/Details/index.tsx @@ -189,6 +189,11 @@ export const RunDetailsPage: React.FC = () => { id: CodeTab.Events, href: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.EVENTS.FORMAT(paramProjectName, paramRunId), }, + { + label: 'Inspect', + id: CodeTab.Inspect, + href: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.INSPECT.FORMAT(paramProjectName, paramRunId), + }, ]} /> )} diff --git a/frontend/src/router.tsx b/frontend/src/router.tsx index 1bba4cb161..fbdeca2942 100644 --- a/frontend/src/router.tsx +++ b/frontend/src/router.tsx @@ -13,6 +13,7 @@ import { Logout } from 'App/Logout'; import { FleetDetails, FleetList } from 'pages/Fleets'; import { EventsList as FleetEventsList } from 'pages/Fleets/Details/Events'; import { FleetDetails as FleetDetailsGeneral } from 'pages/Fleets/Details/FleetDetails'; +import { FleetInspect } from 'pages/Fleets/Details/Inspect'; import { InstanceList } from 'pages/Instances'; import { ModelsList } from 'pages/Models'; import { ModelDetails } from 'pages/Models/Details'; @@ -28,6 +29,7 @@ import { RunDetailsPage, RunList, } from 'pages/Runs'; +import { RunInspect } from 'pages/Runs/Details/Inspect'; import { JobDetailsPage } from 'pages/Runs/Details/Jobs/Details'; import { EventsList as JobEvents } from 'pages/Runs/Details/Jobs/Events'; import { CreditsHistoryAdd, UserAdd, UserDetails, UserEdit, UserList } 
from 'pages/User'; @@ -122,6 +124,10 @@ export const router = createBrowserRouter([ path: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.EVENTS.TEMPLATE, element: , }, + { + path: ROUTES.PROJECT.DETAILS.RUNS.DETAILS.INSPECT.TEMPLATE, + element: , + }, ], }, { @@ -208,6 +214,10 @@ export const router = createBrowserRouter([ path: ROUTES.FLEETS.DETAILS.EVENTS.TEMPLATE, element: , }, + { + path: ROUTES.FLEETS.DETAILS.INSPECT.TEMPLATE, + element: , + }, ], }, diff --git a/frontend/src/routes.ts b/frontend/src/routes.ts index 6bc1fb0e5a..fea2f978a4 100644 --- a/frontend/src/routes.ts +++ b/frontend/src/routes.ts @@ -43,6 +43,11 @@ export const ROUTES = { FORMAT: (projectName: string, runId: string) => buildRoute(ROUTES.PROJECT.DETAILS.RUNS.DETAILS.LOGS.TEMPLATE, { projectName, runId }), }, + INSPECT: { + TEMPLATE: `/projects/:projectName/runs/:runId/inspect`, + FORMAT: (projectName: string, runId: string) => + buildRoute(ROUTES.PROJECT.DETAILS.RUNS.DETAILS.INSPECT.TEMPLATE, { projectName, runId }), + }, JOBS: { DETAILS: { TEMPLATE: `/projects/:projectName/runs/:runId/jobs/:jobName`, @@ -141,6 +146,11 @@ export const ROUTES = { FORMAT: (projectName: string, fleetId: string) => buildRoute(ROUTES.FLEETS.DETAILS.EVENTS.TEMPLATE, { projectName, fleetId }), }, + INSPECT: { + TEMPLATE: `/projects/:projectName/fleets/:fleetId/inspect`, + FORMAT: (projectName: string, fleetId: string) => + buildRoute(ROUTES.FLEETS.DETAILS.INSPECT.TEMPLATE, { projectName, fleetId }), + }, }, }, diff --git a/src/dstack/_internal/cli/commands/fleet.py b/src/dstack/_internal/cli/commands/fleet.py index c6a11abc3a..130e2c3fcf 100644 --- a/src/dstack/_internal/cli/commands/fleet.py +++ b/src/dstack/_internal/cli/commands/fleet.py @@ -1,5 +1,6 @@ import argparse import time +from uuid import UUID from rich.live import Live @@ -12,7 +13,8 @@ console, ) from dstack._internal.cli.utils.fleet import get_fleets_table, print_fleets_table -from dstack._internal.core.errors import ResourceNotExistsError +from 
dstack._internal.core.errors import CLIError, ResourceNotExistsError +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent class FleetCommand(APIBaseCommand): @@ -63,6 +65,29 @@ def _register(self): ) delete_parser.set_defaults(subfunc=self._delete) + get_parser = subparsers.add_parser( + "get", help="Get a fleet", formatter_class=self._parser.formatter_class + ) + name_group = get_parser.add_mutually_exclusive_group(required=True) + name_group.add_argument( + "name", + nargs="?", + metavar="NAME", + help="The name of the fleet", + ).completer = FleetNameCompleter() # type: ignore[attr-defined] + name_group.add_argument( + "--id", + type=str, + help="The ID of the fleet (UUID)", + ) + get_parser.add_argument( + "--json", + action="store_true", + required=True, + help="Output in JSON format", + ) + get_parser.set_defaults(subfunc=self._get) + def _command(self, args: argparse.Namespace): super()._command(args) args.subfunc(args) @@ -112,3 +137,25 @@ def _delete(self, args: argparse.Namespace): ) console.print(f"Fleet [code]{args.name}[/] instances deleted") + + def _get(self, args: argparse.Namespace): + # TODO: Implement non-json output format + fleet_id = None + if args.id is not None: + try: + fleet_id = UUID(args.id) + except ValueError: + raise CLIError(f"Invalid UUID format: {args.id}") + + try: + if args.id is not None: + fleet = self.api.client.fleets.get( + project_name=self.api.project, fleet_id=fleet_id + ) + else: + fleet = self.api.client.fleets.get(project_name=self.api.project, name=args.name) + except ResourceNotExistsError: + console.print(f"Fleet [code]{args.name or args.id}[/] not found") + exit(1) + + print(pydantic_orjson_dumps_with_indent(fleet.dict(), default=None)) diff --git a/src/dstack/_internal/cli/commands/gateway.py b/src/dstack/_internal/cli/commands/gateway.py index 31ecef3ddf..be7e6138a1 100644 --- a/src/dstack/_internal/cli/commands/gateway.py +++ b/src/dstack/_internal/cli/commands/gateway.py @@ -16,7 
+16,8 @@ print_gateways_json, print_gateways_table, ) -from dstack._internal.core.errors import CLIError +from dstack._internal.core.errors import CLIError, ResourceNotExistsError +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) @@ -83,6 +84,20 @@ def _register(self): ) update_parser.add_argument("--domain", help="Set the domain for the gateway") + get_parser = subparsers.add_parser( + "get", help="Get a gateway", formatter_class=self._parser.formatter_class + ) + get_parser.add_argument( + "name", metavar="NAME", help="The name of the gateway" + ).completer = GatewayNameCompleter() # type: ignore[attr-defined] + get_parser.add_argument( + "--json", + action="store_true", + required=True, + help="Output in JSON format", + ) + get_parser.set_defaults(subfunc=self._get) + def _command(self, args: argparse.Namespace): super()._command(args) # TODO handle errors @@ -130,3 +145,15 @@ def _update(self, args: argparse.Namespace): ) gateway = self.api.client.gateways.get(self.api.project, args.name) print_gateways_table([gateway]) + + def _get(self, args: argparse.Namespace): + # TODO: Implement non-json output format + try: + gateway = self.api.client.gateways.get( + project_name=self.api.project, gateway_name=args.name + ) + except ResourceNotExistsError: + console.print("Gateway not found") + exit(1) + + print(pydantic_orjson_dumps_with_indent(gateway.dict(), default=None)) diff --git a/src/dstack/_internal/cli/commands/run.py b/src/dstack/_internal/cli/commands/run.py new file mode 100644 index 0000000000..337b0a75cf --- /dev/null +++ b/src/dstack/_internal/cli/commands/run.py @@ -0,0 +1,69 @@ +import argparse +from uuid import UUID + +from dstack._internal.cli.commands import APIBaseCommand +from dstack._internal.cli.services.completion import RunNameCompleter +from dstack._internal.cli.utils.common import console +from dstack._internal.core.errors 
import CLIError, ResourceNotExistsError +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent + + +class RunCommand(APIBaseCommand): + NAME = "run" + DESCRIPTION = "Manage runs" + + def _register(self): + super()._register() + subparsers = self._parser.add_subparsers(dest="action") + + # TODO: Add `list` subcommand and make `dstack ps` an alias to `dstack run list` + + get_parser = subparsers.add_parser( + "get", help="Get a run", formatter_class=self._parser.formatter_class + ) + name_group = get_parser.add_mutually_exclusive_group(required=True) + name_group.add_argument( + "name", + nargs="?", + metavar="NAME", + help="The name of the run", + ).completer = RunNameCompleter() # type: ignore[attr-defined] + name_group.add_argument( + "--id", + type=str, + help="The ID of the run (UUID)", + ) + get_parser.add_argument( + "--json", + action="store_true", + required=True, + help="Output in JSON format", + ) + get_parser.set_defaults(subfunc=self._get) + + def _command(self, args: argparse.Namespace): + super()._command(args) + if hasattr(args, "subfunc"): + args.subfunc(args) + else: + self._parser.print_help() + + def _get(self, args: argparse.Namespace): + # TODO: Implement non-json output format + run_id = None + if args.id is not None: + try: + run_id = UUID(args.id) + except ValueError: + raise CLIError(f"Invalid UUID format: {args.id}") + + try: + if args.id is not None: + run = self.api.client.runs.get(project_name=self.api.project, run_id=run_id) + else: + run = self.api.client.runs.get(project_name=self.api.project, run_name=args.name) + except ResourceNotExistsError: + console.print(f"Run [code]{args.name or args.id}[/] not found") + exit(1) + + print(pydantic_orjson_dumps_with_indent(run.dict(), default=None)) diff --git a/src/dstack/_internal/cli/commands/volume.py b/src/dstack/_internal/cli/commands/volume.py index 3f7da2e00a..e78ec352c6 100644 --- a/src/dstack/_internal/cli/commands/volume.py +++ 
b/src/dstack/_internal/cli/commands/volume.py @@ -13,6 +13,7 @@ ) from dstack._internal.cli.utils.volume import get_volumes_table, print_volumes_table from dstack._internal.core.errors import ResourceNotExistsError +from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent class VolumeCommand(APIBaseCommand): @@ -54,6 +55,22 @@ def _register(self): ) delete_parser.set_defaults(subfunc=self._delete) + get_parser = subparsers.add_parser( + "get", help="Get a volume", formatter_class=self._parser.formatter_class + ) + get_parser.add_argument( + "name", + metavar="NAME", + help="The name of the volume", + ).completer = VolumeNameCompleter() # type: ignore[attr-defined] + get_parser.add_argument( + "--json", + action="store_true", + required=True, + help="Output in JSON format", + ) + get_parser.set_defaults(subfunc=self._get) + def _command(self, args: argparse.Namespace): super()._command(args) args.subfunc(args) @@ -88,3 +105,13 @@ def _delete(self, args: argparse.Namespace): self.api.client.volumes.delete(project_name=self.api.project, names=[args.name]) console.print(f"Volume [code]{args.name}[/] deleted") + + def _get(self, args: argparse.Namespace): + # TODO: Implement non-json output format + try: + volume = self.api.client.volumes.get(project_name=self.api.project, name=args.name) + except ResourceNotExistsError: + console.print("Volume not found") + exit(1) + + print(pydantic_orjson_dumps_with_indent(volume.dict(), default=None)) diff --git a/src/dstack/_internal/cli/main.py b/src/dstack/_internal/cli/main.py index 61f3967ab7..a5f678a98e 100644 --- a/src/dstack/_internal/cli/main.py +++ b/src/dstack/_internal/cli/main.py @@ -18,6 +18,7 @@ from dstack._internal.cli.commands.offer import OfferCommand from dstack._internal.cli.commands.project import ProjectCommand from dstack._internal.cli.commands.ps import PsCommand +from dstack._internal.cli.commands.run import RunCommand from dstack._internal.cli.commands.secrets import SecretCommand 
from dstack._internal.cli.commands.server import ServerCommand from dstack._internal.cli.commands.stop import StopCommand @@ -74,6 +75,7 @@ def main(): MetricsCommand.register(subparsers) ProjectCommand.register(subparsers) PsCommand.register(subparsers) + RunCommand.register(subparsers) SecretCommand.register(subparsers) ServerCommand.register(subparsers) StopCommand.register(subparsers) diff --git a/src/dstack/api/server/_fleets.py b/src/dstack/api/server/_fleets.py index 8f6ea7fcfa..9bfb1cb422 100644 --- a/src/dstack/api/server/_fleets.py +++ b/src/dstack/api/server/_fleets.py @@ -1,4 +1,5 @@ -from typing import List, Union +from typing import List, Optional, Union +from uuid import UUID from pydantic import parse_obj_as @@ -24,8 +25,14 @@ def list(self, project_name: str) -> List[Fleet]: resp = self._request(f"/api/project/{project_name}/fleets/list") return parse_obj_as(List[Fleet.__response__], resp.json()) - def get(self, project_name: str, name: str) -> Fleet: - body = GetFleetRequest(name=name) + def get( + self, project_name: str, name: Optional[str] = None, fleet_id: Optional[UUID] = None + ) -> Fleet: + if name is None and fleet_id is None: + raise ValueError("Either name or fleet_id must be provided") + if name is not None and fleet_id is not None: + raise ValueError("Cannot specify both name and fleet_id") + body = GetFleetRequest(name=name, id=fleet_id) resp = self._request( f"/api/project/{project_name}/fleets/get", body=body.json(), diff --git a/src/dstack/api/server/_runs.py b/src/dstack/api/server/_runs.py index 745ce9c782..ead179763b 100644 --- a/src/dstack/api/server/_runs.py +++ b/src/dstack/api/server/_runs.py @@ -57,8 +57,14 @@ def list( ) return parse_obj_as(List[Run.__response__], resp.json()) - def get(self, project_name: str, run_name: str) -> Run: - body = GetRunRequest(run_name=run_name) + def get( + self, project_name: str, run_name: Optional[str] = None, run_id: Optional[UUID] = None + ) -> Run: + if run_name is None and run_id is 
None: + raise ValueError("Either run_name or run_id must be provided") + if run_name is not None and run_id is not None: + raise ValueError("Cannot specify both run_name and run_id") + body = GetRunRequest(run_name=run_name, id=run_id) json_body = body.json() resp = self._request(f"/api/project/{project_name}/runs/get", body=json_body) return parse_obj_as(Run.__response__, resp.json()) From 3be819be51a8e40a8588f0a172e62980089b8666 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 12 Jan 2026 17:30:27 +0500 Subject: [PATCH 043/187] Use the same metrics endpoint label for 404 requests (#3455) * Use the same metrics endpoint label for 404 requests * Leave comment on high cardinality labels --- src/dstack/_internal/server/app.py | 16 ++++++++++++++-- .../_internal/server/routers/prometheus.py | 3 +++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index 527dd128fe..488a5a9e0e 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -306,19 +306,31 @@ def _extract_project_name(request: Request): return project_name + def _extract_endpoint_label(request: Request, response: Response) -> str: + route = request.scope.get("route") + route_path = getattr(route, "path", None) + if route_path: + return route_path + if not request.url.path.startswith("/api/"): + return "__non_api__" + if response.status_code == status.HTTP_404_NOT_FOUND: + return "__not_found__" + return "__unmatched__" + project_name = _extract_project_name(request) response: Response = await call_next(request) + endpoint_label = _extract_endpoint_label(request, response) REQUEST_DURATION.labels( method=request.method, - endpoint=request.url.path, + endpoint=endpoint_label, http_status=response.status_code, project_name=project_name, ).observe(request.state.process_time) REQUESTS_TOTAL.labels( method=request.method, - endpoint=request.url.path, + endpoint=endpoint_label, 
http_status=response.status_code, project_name=project_name, ).inc() diff --git a/src/dstack/_internal/server/routers/prometheus.py b/src/dstack/_internal/server/routers/prometheus.py index a5538edfec..da0115eb77 100644 --- a/src/dstack/_internal/server/routers/prometheus.py +++ b/src/dstack/_internal/server/routers/prometheus.py @@ -25,6 +25,9 @@ async def get_prometheus_metrics( session: Annotated[AsyncSession, Depends(get_session)], ) -> str: + # Note: Prometheus warns against storing high cardinality values in labels, + # yet both client and custom metrics have labels like project, run, fleet, etc. + # This may require a very big Prometheus server with lots of storage. if not settings.ENABLE_PROMETHEUS_METRICS: raise error_not_found() custom_metrics_ = await custom_metrics.get_metrics(session=session) From fae73ce0d2f6441b3e8fa11551016f90ec420caf Mon Sep 17 00:00:00 2001 From: Oleg Date: Mon, 12 Jan 2026 22:50:25 +0300 Subject: [PATCH 044/187] Refactoring Inspect page (#3457) --- .../{form => }/CodeEditor/constants.ts | 0 frontend/src/components/CodeEditor/index.tsx | 55 +++++++++++++++++++ .../src/components/form/CodeEditor/index.tsx | 45 +-------------- .../src/components/form/CodeEditor/types.ts | 3 +- frontend/src/components/index.ts | 2 + .../pages/Fleets/Details/Inspect/index.tsx | 44 +-------------- .../src/pages/Runs/Details/Inspect/index.tsx | 44 +-------------- 7 files changed, 65 insertions(+), 128 deletions(-) rename frontend/src/components/{form => }/CodeEditor/constants.ts (100%) create mode 100644 frontend/src/components/CodeEditor/index.tsx diff --git a/frontend/src/components/form/CodeEditor/constants.ts b/frontend/src/components/CodeEditor/constants.ts similarity index 100% rename from frontend/src/components/form/CodeEditor/constants.ts rename to frontend/src/components/CodeEditor/constants.ts diff --git a/frontend/src/components/CodeEditor/index.tsx b/frontend/src/components/CodeEditor/index.tsx new file mode 100644 index 
0000000000..f8d9daf385 --- /dev/null +++ b/frontend/src/components/CodeEditor/index.tsx @@ -0,0 +1,55 @@ +import React, { useEffect, useState } from 'react'; +import ace from 'ace-builds'; +import GeneralCodeEditor, { CodeEditorProps as GeneralCodeEditorProps } from '@cloudscape-design/components/code-editor'; + +ace.config.set('useWorker', false); + +import { Mode } from '@cloudscape-design/global-styles'; + +import { useAppSelector } from 'hooks'; + +import { selectSystemMode } from 'App/slice'; + +import { CODE_EDITOR_I18N_STRINGS } from './constants'; + +import 'ace-builds/src-noconflict/theme-cloud_editor'; +import 'ace-builds/src-noconflict/theme-cloud_editor_dark'; +import 'ace-builds/src-noconflict/mode-yaml'; +import 'ace-builds/src-noconflict/mode-json'; +import 'ace-builds/src-noconflict/ext-language_tools'; + +export type CodeEditorProps = Omit; + +export const CodeEditor: React.FC = (props) => { + const systemMode = useAppSelector(selectSystemMode) ?? ''; + + const [codeEditorPreferences, setCodeEditorPreferences] = useState(() => ({ + theme: systemMode === Mode.Dark ? 
'cloud_editor_dark' : 'cloud_editor', + })); + + useEffect(() => { + if (systemMode === Mode.Dark) + setCodeEditorPreferences({ + theme: 'cloud_editor_dark', + }); + else + setCodeEditorPreferences({ + theme: 'cloud_editor', + }); + }, [systemMode]); + + const onCodeEditorPreferencesChange: GeneralCodeEditorProps['onPreferencesChange'] = (e) => { + setCodeEditorPreferences(e.detail); + }; + + return ( + + ); +}; diff --git a/frontend/src/components/form/CodeEditor/index.tsx b/frontend/src/components/form/CodeEditor/index.tsx index 4d23ea1012..254c960d00 100644 --- a/frontend/src/components/form/CodeEditor/index.tsx +++ b/frontend/src/components/form/CodeEditor/index.tsx @@ -1,26 +1,11 @@ -import React, { useEffect, useState } from 'react'; +import React from 'react'; import { Controller, FieldValues } from 'react-hook-form'; -import ace from 'ace-builds'; -import CodeEditor, { CodeEditorProps } from '@cloudscape-design/components/code-editor'; import FormField from '@cloudscape-design/components/form-field'; -import { CODE_EDITOR_I18N_STRINGS } from './constants'; +import { CodeEditor } from '../../CodeEditor'; import { FormCodeEditorProps } from './types'; -ace.config.set('useWorker', false); - -import { Mode } from '@cloudscape-design/global-styles'; - -import { useAppSelector } from 'hooks'; - -import { selectSystemMode } from 'App/slice'; - -import 'ace-builds/src-noconflict/theme-cloud_editor'; -import 'ace-builds/src-noconflict/theme-cloud_editor_dark'; -import 'ace-builds/src-noconflict/mode-yaml'; -import 'ace-builds/src-noconflict/ext-language_tools'; - export const FormCodeEditor = ({ name, control, @@ -34,27 +19,6 @@ export const FormCodeEditor = ({ onChange: onChangeProp, ...props }: FormCodeEditorProps) => { - const systemMode = useAppSelector(selectSystemMode) ?? ''; - - const [codeEditorPreferences, setCodeEditorPreferences] = useState(() => ({ - theme: systemMode === Mode.Dark ? 
'cloud_editor_dark' : 'cloud_editor', - })); - - useEffect(() => { - if (systemMode === Mode.Dark) - setCodeEditorPreferences({ - theme: 'cloud_editor_dark', - }); - else - setCodeEditorPreferences({ - theme: 'cloud_editor', - }); - }, [systemMode]); - - const onCodeEditorPreferencesChange: CodeEditorProps['onPreferencesChange'] = (e) => { - setCodeEditorPreferences(e.detail); - }; - return ( ({ { onChange(event.detail.value); onChangeProp?.(event); }} - themes={{ light: [], dark: [] }} - preferences={codeEditorPreferences} - onPreferencesChange={onCodeEditorPreferencesChange} /> ); diff --git a/frontend/src/components/form/CodeEditor/types.ts b/frontend/src/components/form/CodeEditor/types.ts index 380c009c56..baedd567b8 100644 --- a/frontend/src/components/form/CodeEditor/types.ts +++ b/frontend/src/components/form/CodeEditor/types.ts @@ -1,7 +1,8 @@ import { ControllerProps, FieldValues } from 'react-hook-form'; -import { CodeEditorProps } from '@cloudscape-design/components/code-editor'; import { FormFieldProps } from '@cloudscape-design/components/form-field'; +import { CodeEditorProps } from '../../CodeEditor'; + export type FormCodeEditorProps = Omit< CodeEditorProps, 'value' | 'name' | 'i18nStrings' | 'ace' | 'onPreferencesChange' | 'preferences' diff --git a/frontend/src/components/index.ts b/frontend/src/components/index.ts index 70d240a25e..c8aa4013fb 100644 --- a/frontend/src/components/index.ts +++ b/frontend/src/components/index.ts @@ -88,6 +88,8 @@ export type { FormCardsProps } from './form/Cards/types'; export { FormCards } from './form/Cards'; export { Notifications } from './Notifications'; export { ConfirmationDialog } from './ConfirmationDialog'; +export { CodeEditor } from './CodeEditor'; +export type { CodeEditorProps } from './CodeEditor'; export { FileUploader } from './FileUploader'; export { InfoLink } from './InfoLink'; export { ButtonWithConfirmation } from './ButtonWithConfirmation'; diff --git 
a/frontend/src/pages/Fleets/Details/Inspect/index.tsx b/frontend/src/pages/Fleets/Details/Inspect/index.tsx index 844ebe849d..8d9c5d5095 100644 --- a/frontend/src/pages/Fleets/Details/Inspect/index.tsx +++ b/frontend/src/pages/Fleets/Details/Inspect/index.tsx @@ -1,25 +1,11 @@ -import React, { useEffect, useMemo, useState } from 'react'; +import React, { useEffect, useMemo } from 'react'; import { useTranslation } from 'react-i18next'; import { useParams } from 'react-router-dom'; -import ace from 'ace-builds'; -import CodeEditor, { CodeEditorProps } from '@cloudscape-design/components/code-editor'; -import { Mode } from '@cloudscape-design/global-styles'; -import { Container, Header, Loader } from 'components'; -import { CODE_EDITOR_I18N_STRINGS } from 'components/form/CodeEditor/constants'; +import { CodeEditor, Container, Header, Loader } from 'components'; -import { useAppSelector } from 'hooks'; import { useGetFleetDetailsQuery } from 'services/fleet'; -import { selectSystemMode } from 'App/slice'; - -import 'ace-builds/src-noconflict/theme-cloud_editor'; -import 'ace-builds/src-noconflict/theme-cloud_editor_dark'; -import 'ace-builds/src-noconflict/mode-json'; -import 'ace-builds/src-noconflict/ext-language_tools'; - -ace.config.set('useWorker', false); - interface AceEditorElement extends HTMLElement { env?: { editor?: { @@ -34,8 +20,6 @@ export const FleetInspect = () => { const paramProjectName = params.projectName ?? ''; const paramFleetId = params.fleetId ?? ''; - const systemMode = useAppSelector(selectSystemMode) ?? ''; - const { data: fleetData, isLoading } = useGetFleetDetailsQuery( { projectName: paramProjectName, @@ -46,25 +30,6 @@ export const FleetInspect = () => { }, ); - const [codeEditorPreferences, setCodeEditorPreferences] = useState(() => ({ - theme: systemMode === Mode.Dark ? 
'cloud_editor_dark' : 'cloud_editor', - })); - - useEffect(() => { - if (systemMode === Mode.Dark) - setCodeEditorPreferences({ - theme: 'cloud_editor_dark', - }); - else - setCodeEditorPreferences({ - theme: 'cloud_editor', - }); - }, [systemMode]); - - const onCodeEditorPreferencesChange: CodeEditorProps['onPreferencesChange'] = (e) => { - setCodeEditorPreferences(e.detail); - }; - const jsonContent = useMemo(() => { if (!fleetData) return ''; return JSON.stringify(fleetData, null, 2); @@ -98,11 +63,6 @@ export const FleetInspect = () => { { // Prevent editing - onChange is required but we ignore changes diff --git a/frontend/src/pages/Runs/Details/Inspect/index.tsx b/frontend/src/pages/Runs/Details/Inspect/index.tsx index f37aa90ad3..5dc9e9a46b 100644 --- a/frontend/src/pages/Runs/Details/Inspect/index.tsx +++ b/frontend/src/pages/Runs/Details/Inspect/index.tsx @@ -1,25 +1,11 @@ -import React, { useEffect, useMemo, useState } from 'react'; +import React, { useEffect, useMemo } from 'react'; import { useTranslation } from 'react-i18next'; import { useParams } from 'react-router-dom'; -import ace from 'ace-builds'; -import CodeEditor, { CodeEditorProps } from '@cloudscape-design/components/code-editor'; -import { Mode } from '@cloudscape-design/global-styles'; -import { Container, Header, Loader } from 'components'; -import { CODE_EDITOR_I18N_STRINGS } from 'components/form/CodeEditor/constants'; +import { CodeEditor, Container, Header, Loader } from 'components'; -import { useAppSelector } from 'hooks'; import { useGetRunQuery } from 'services/run'; -import { selectSystemMode } from 'App/slice'; - -import 'ace-builds/src-noconflict/theme-cloud_editor'; -import 'ace-builds/src-noconflict/theme-cloud_editor_dark'; -import 'ace-builds/src-noconflict/mode-json'; -import 'ace-builds/src-noconflict/ext-language_tools'; - -ace.config.set('useWorker', false); - interface AceEditorElement extends HTMLElement { env?: { editor?: { @@ -34,32 +20,11 @@ export const RunInspect 
= () => { const paramProjectName = params.projectName ?? ''; const paramRunId = params.runId ?? ''; - const systemMode = useAppSelector(selectSystemMode) ?? ''; - const { data: runData, isLoading } = useGetRunQuery({ project_name: paramProjectName, id: paramRunId, }); - const [codeEditorPreferences, setCodeEditorPreferences] = useState(() => ({ - theme: systemMode === Mode.Dark ? 'cloud_editor_dark' : 'cloud_editor', - })); - - useEffect(() => { - if (systemMode === Mode.Dark) - setCodeEditorPreferences({ - theme: 'cloud_editor_dark', - }); - else - setCodeEditorPreferences({ - theme: 'cloud_editor', - }); - }, [systemMode]); - - const onCodeEditorPreferencesChange: CodeEditorProps['onPreferencesChange'] = (e) => { - setCodeEditorPreferences(e.detail); - }; - const jsonContent = useMemo(() => { if (!runData) return ''; return JSON.stringify(runData, null, 2); @@ -93,11 +58,6 @@ export const RunInspect = () => { { // Prevent editing - onChange is required but we ignore changes From 9e5b3b321e62a37b765ce5ac3de18b9471419b3f Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 13 Jan 2026 13:55:35 +0100 Subject: [PATCH 045/187] Migrate from Slurm (#3454) * Slurm guide - work in progress * Linter * Minor update * Minor edits + review around containers use with Slurm * Minor styling changes * Minor edit - introduction * Minor changes --- docs/assets/stylesheets/extra.css | 17 +- docs/docs/guides/migration/slurm.md | 1850 +++++++++++++++++ docs/docs/guides/{migration.md => upgrade.md} | 2 +- docs/layouts/custom.yml | 26 +- docs/overrides/home.html | 18 +- mkdocs.yml | 7 +- 6 files changed, 1883 insertions(+), 37 deletions(-) create mode 100644 docs/docs/guides/migration/slurm.md rename docs/docs/guides/{migration.md => upgrade.md} (99%) diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index 99655a1fe7..e0a16fcec5 100644 --- a/docs/assets/stylesheets/extra.css +++ 
b/docs/assets/stylesheets/extra.css @@ -782,10 +782,10 @@ body { } .md-sidebar--primary .md-nav__item--section.md-nav__item .md-nav__link--active { - border-left: 2.5px solid var(--md-typeset-a-color); + border-left: 3px solid var(--md-typeset-a-color); color: inherit; border-image: linear-gradient(8deg, #0048ff, #ce00ff, #ce00ff, #ce00ff) 10; - margin-left: -1.5px; + margin-left: -2px; font-size: 16.5px; padding-left: 14px; } @@ -857,8 +857,9 @@ body { .md-nav[data-md-level="2"] > .md-nav__list > .md-nav__item { /*margin-left: -16px !important;*/ - border-left: 0.5px dotted rgba(0, 0, 0, 0.4); + border-left: 0.5px dotted rgba(0, 0, 0, 1); /*background: red;*/ + margin-bottom: 0.5px; } .md-nav[data-md-level="3"] > .md-nav__list > .md-nav__item:last-of-type { @@ -866,7 +867,7 @@ body { } .md-sidebar--primary .md-nav__link, .md-sidebar--post .md-nav__link { - padding: 4px 15px 4px; + padding: 2px 15px 4px; margin-top: 0; } @@ -991,7 +992,8 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { } .md-nav--primary .md-nav__list { - padding-bottom: .2rem; + padding-top: .15rem; + padding-bottom: .3rem; } } @@ -1285,9 +1287,8 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { content: ""; width: 100%; z-index: 1000; - height: 2.5px; - bottom: -4.5px; - border-radius: 2px; + height: 3px; + bottom: -5px; } .md-tabs[hidden] .md-tabs__link { diff --git a/docs/docs/guides/migration/slurm.md b/docs/docs/guides/migration/slurm.md new file mode 100644 index 0000000000..82c1548a4b --- /dev/null +++ b/docs/docs/guides/migration/slurm.md @@ -0,0 +1,1850 @@ +--- +title: Migrate from Slurm +description: This guide compares Slurm and dstack, and shows how to orchestrate equivalent GPU-based workloads using dstack. +--- + +# Migrate from Slurm + +Both Slurm and `dstack` are open-source workload orchestration systems designed to manage compute resources and schedule jobs. 
This guide compares Slurm and `dstack`, maps features between the two systems, and shows their `dstack` equivalents. + +!!! tip "Slurm vs dstack" + Slurm is a battle-tested system with decades of production use in HPC environments. `dstack` is designed for modern ML/AI workloads with cloud-native provisioning and container-first architecture. Slurm is better suited for traditional HPC centers with static clusters; `dstack` is better suited for cloud-native ML teams working with cloud GPUs. Both systems can handle distributed training and batch workloads. + +| | Slurm | dstack | +|---|-------|--------| +| **Provisioning** | Pre-configured static clusters; cloud requires third-party integrations with potential limitations | Native integration with top GPU clouds; automatically provisions clusters on demand | +| **Containers** | Optional via plugins | Built around containers from the ground up | +| **Use cases** | Batch job scheduling and distributed training | Interactive development, distributed training, and production inference services | +| **Personas** | HPC centers, academic institutions, research labs | ML engineering teams, AI startups, cloud-native organizations | + +While `dstack` is designed to be use-case agnostic and supports both development and production-grade inference, this guide focuses specifically on training workloads. + +## Architecture + +Both Slurm and `dstack` follow a client-server architecture with a control plane and a compute plane running on cluster instances. 
+ +| | Slurm | dstack | +|---|---------------|-------------------| +| **Control plane** | `slurmctld` (controller) | `dstack-server` | +| **State persistence** | `slurmdbd` (database) | `dstack-server` (SQLite/PostgreSQL) | +| **REST API** | `slurmrestd` (REST API) | `dstack-server` (HTTP API) | +| **Compute plane** | `slurmd` (compute agent) | `dstack-shim` (on VMs/hosts) and/or `dstack-runner` (inside containers) | +| **Client** | CLI from login nodes | CLI from anywhere | +| **High availability** | Active-passive failover (typically 2 controller nodes) | Horizontal scaling with multiple server replicas (requires PostgreSQL) | + +## Job configuration and submission + +Both Slurm and `dstack` allow defining jobs as files and submitting them via CLI. + +### Slurm + +Slurm uses shell scripts with `#SBATCH` directives embedded in the script: + +
+ +```bash +#!/bin/bash +#SBATCH --job-name=train-model +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:1 +#SBATCH --mem=32G +#SBATCH --time=2:00:00 +#SBATCH --partition=gpu +#SBATCH --output=train-%j.out +#SBATCH --error=train-%j.err + +export HF_TOKEN +export LEARNING_RATE=0.001 + +module load python/3.9 +srun python train.py --batch-size=64 +``` + +
+ +Submit the job from a login node (with environment variables that override script defaults): + +
+ +```shell +$ sbatch --export=ALL,LEARNING_RATE=0.002 train.sh + Submitted batch job 12346 +``` + +
+ +### dstack + +`dstack` uses declarative YAML configuration files: + +
+ +```yaml +type: task +name: train-model + +python: 3.9 +repos: + - . + +env: + - HF_TOKEN + - LEARNING_RATE=0.001 + +commands: + - python train.py --batch-size=64 + +resources: + gpu: 1 + memory: 32GB + cpu: 8 + shm_size: 8GB + +max_duration: 2h +``` + +
+ +Submit the job from anywhere (laptop, CI/CD) via the CLI. `dstack apply` allows overriding various options and runs in attached mode by default, streaming job output in real-time: + +
+ +```shell +$ dstack apply -f .dstack.yml --env LEARNING_RATE=0.002 + + # BACKEND REGION RESOURCES SPOT PRICE + 1 aws us-east-1 4xCPU, 16GB, T4:1 yes $0.10 + +Submit the run train-model? [y/n]: y + +Launching `train-model`... +---> 100% +``` + +
+ +### Configuration comparison + +| | Slurm | dstack | +|---|-------|--------| +| **File type** | Shell script with `#SBATCH` directives | YAML configuration file (`.dstack.yml`) | +| **GPU** | `--gres=gpu:N` or `--gres=gpu:type:N` | `gpu: A100:80GB:4` or `gpu: 40GB..80GB:2..8` (supports ranges) | +| **Memory** | `--mem=M` (per node) or `--mem-per-cpu=M` | `memory: 200GB..` (range, per node, minimum requirement) | +| **CPU** | `--cpus-per-task=C` or `--ntasks` | `cpu: 32` (per node) | +| **Shared memory** | Configured on host | `shm_size: 24GB` (explicit) | +| **Duration** | `--time=2:00:00` | `max_duration: 2h` (both enforce walltime) | +| **Cluster** | `--partition=gpu` | `fleets: [gpu]` (see Partitions and fleets below) | +| **Output** | `--output=train-%j.out` (writes files) | `dstack logs` or UI (streams via API) | +| **Working directory** | `--chdir=/path/to/dir` or defaults to submission directory | `working_dir: /path/to/dir` (defaults to image's working directory, typically `/dstack/run`) | +| **Environment variables** | `export VAR` or `--export=ALL,VAR=value` | `env: - VAR` or `--env VAR=value` | +| **Node exclusivity** | `--exclusive` (entire node) | Automatic if `blocks` is not used or job uses all blocks; required for distributed tasks (`nodes` > 1) | + +> For multi-node examples, see [Distributed training](#distributed-training) below. + +## Containers + +### Slurm + +By default, Slurm runs jobs on compute nodes using the host OS with cgroups for resource isolation and full access to the host filesystem. Container execution is optional via plugins but require explicit filesystem mounts. + +=== "Singularity/Apptainer" + + Container image must exist on shared filesystem. 
Mount host directories with `--container-mounts`: + + ```bash + #!/bin/bash + #SBATCH --nodes=1 + #SBATCH --gres=gpu:1 + #SBATCH --mem=32G + #SBATCH --time=2:00:00 + + srun --container-image=/shared/images/pytorch-2.0-cuda11.8.sif \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + python train.py --batch-size=64 + ``` + +=== "Pyxis with Enroot" + + Pyxis plugin pulls images from Docker registry. Mount host directories with `--container-mounts`: + + ```bash + #!/bin/bash + #SBATCH --nodes=1 + #SBATCH --gres=gpu:1 + #SBATCH --mem=32G + #SBATCH --time=2:00:00 + + srun --container-image=pytorch/pytorch:2.0.0-cuda11.8-cudnn8-runtime \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + python train.py --batch-size=64 + ``` + +=== "Enroot" + + Pulls images from registry. Mount host directories with `--container-mounts`: + + ```bash + #!/bin/bash + #SBATCH --nodes=1 + #SBATCH --gres=gpu:1 + #SBATCH --mem=32G + #SBATCH --time=2:00:00 + + srun --container-image=docker://pytorch/pytorch:2.0.0-cuda11.8-cudnn8-runtime \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + python train.py --batch-size=64 + ``` + +### dstack + +`dstack` always uses container. If `image` is not specified, `dstack` uses a base Docker image with `uv`, `python`, essential CUDA drivers, and other dependencies. You can also specify your own Docker image: + +=== "Public registry" + + ```yaml + type: task + name: train-with-image + + image: pytorch/pytorch:2.0.0-cuda11.8-cudnn8-runtime + + repos: + - . + + commands: + - python train.py --batch-size=64 + + resources: + gpu: 1 + memory: 32GB + ``` + +=== "Private registry" + + ```yaml + type: task + name: train-ngc + + image: nvcr.io/nvidia/pytorch:24.01-py3 + + registry_auth: + username: $oauthtoken + password: ${{ secrets.nvidia_ngc_api_key }} + + repos: + - . 
+ + commands: + - python train.py --batch-size=64 + + resources: + gpu: 1 + memory: 32GB + ``` + +`dstack` can automatically upload files via `repos` or `files`, or mount filesystems via `volumes`. See [Filesystems and data access](#filesystems-and-data-access) below. + +## Distributed training + +Both Slurm and `dstack` schedule distributed workloads over clusters with fast interconnect, automatically propagating environment variables required by distributed frameworks (PyTorch DDP, DeepSpeed, FSDP, etc.). + +### Slurm + +Slurm explicitly controls both `nodes` and processes/tasks. + +=== "PyTorch DDP" + + ```bash + #!/bin/bash + #SBATCH --job-name=distributed-train + #SBATCH --nodes=4 + #SBATCH --ntasks-per-node=1 # One task per node + #SBATCH --gres=gpu:8 # 8 GPUs per node + #SBATCH --mem=200G + #SBATCH --time=24:00:00 + #SBATCH --partition=gpu + + # Set up distributed training environment + MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) + MASTER_PORT=12345 + + export MASTER_ADDR MASTER_PORT + + # Launch training with torchrun (torch.distributed.launch is deprecated) + srun torchrun \ + --nnodes="$SLURM_JOB_NUM_NODES" \ + --nproc_per_node=8 \ + --node_rank="$SLURM_NODEID" \ + --rdzv_backend=c10d \ + --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \ + train.py \ + --model llama-7b \ + --batch-size=32 \ + --epochs=10 + ``` + + +=== "MPI" + + ```bash + #!/bin/bash + #SBATCH --nodes=2 + #SBATCH --ntasks=16 + #SBATCH --gres=gpu:8 + #SBATCH --mem=200G + #SBATCH --time=24:00:00 + + export MASTER_ADDR=$(scontrol show hostnames $SLURM_NODELIST | head -n1) + export MASTER_PORT=12345 + + # Convert SLURM_JOB_NODELIST to hostfile format + HOSTFILE=$(mktemp) + scontrol show hostnames $SLURM_JOB_NODELIST | awk -v slots=$SLURM_NTASKS_PER_NODE '{print $0" slots="slots}' > $HOSTFILE + + # MPI with NCCL tests or custom MPI application + mpirun \ + --allow-run-as-root \ + --hostfile $HOSTFILE \ + -n $SLURM_NTASKS \ + --bind-to none \ + 
/opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + + rm -f $HOSTFILE + ``` + +### dstack + +`dstack` only specifies `nodes`. A run with multiple nodes creates multiple jobs (one per node), each running in a container on a particular instance. Inside the job container, processes are determined by the user's `commands`. + +=== "PyTorch DDP" + + ```yaml + type: task + name: distributed-train-pytorch + + nodes: 4 + + python: 3.12 + repos: + - . + + env: + - NCCL_DEBUG=INFO + - NCCL_IB_DISABLE=0 + - NCCL_SOCKET_IFNAME=eth0 + + commands: + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + train.py \ + --model llama-7b \ + --batch-size=32 \ + --epochs=10 + + resources: + gpu: A100:80GB:8 + memory: 200GB.. + shm_size: 24GB + + max_duration: 24h + ``` + +=== "MPI" + + For MPI workloads that require specific job startup and termination behavior, `dstack` provides `startup_order` and `stop_criteria` properties. The master node (rank 0) runs the MPI command, while worker nodes wait for the master to complete. + + ```yaml + type: task + name: nccl-tests + + nodes: 2 + startup_order: workers-first + stop_criteria: master-done + + env: + - NCCL_DEBUG=INFO + + commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + + resources: + gpu: nvidia:1..8 + shm_size: 16GB + ``` + + If `startup_order` and `stop_criteria` are not configured (as in the PyTorch DDP example above), the master worker starts first and waits until all workers terminate. For MPI workloads, we need to change this. 
+ +#### Nodes and processes comparison + +| | Slurm | dstack | +|---|-------|--------| +| **Nodes** | `--nodes=4` | `nodes: 4` | +| **Processes/tasks** | `--ntasks=8` or `--ntasks-per-node=2` (controls process distribution) | Determined by `commands` (relies on frameworks like `torchrun`, `accelerate`, `mpirun`, etc.) | + +**Environment variables comparison:** + +| Slurm | dstack | Purpose | +|-------|--------|---------| +| `SLURM_NODELIST` | `DSTACK_NODES_IPS` | Newline-delimited list of node IPs | +| `SLURM_NODEID` | `DSTACK_NODE_RANK` | Node rank (0-based) | +| `SLURM_PROCID` | N/A | Process rank (0-based, across all processes) | +| `SLURM_NTASKS` | `DSTACK_GPUS_NUM` | Total number of processes/GPUs | +| `SLURM_NTASKS_PER_NODE` | `DSTACK_GPUS_PER_NODE` | Number of processes/GPUs per node | +| `SLURM_JOB_NUM_NODES` | `DSTACK_NODES_NUM` | Number of nodes | +| Manual master address | `DSTACK_MASTER_NODE_IP` | Master node IP (automatically set) | +| N/A | `DSTACK_MPI_HOSTFILE` | Pre-populated MPI hostfile | + +!!! info "Fleets" + Distributed tasks may run only on a fleet with `placement: cluster` configured. Refer to [Partitions and fleets](#partitions-and-fleets) for configuration details. + +## Queueing and scheduling + +Both systems support core scheduling features and efficient resource utilization. 
+ +| | Slurm | dstack | +|---------|-------|--------| +| **Prioritization** | Multi-factor system (fairshare, age, QOS); influenced via `--qos` or `--partition` flags | Set via `priority` (0-100); plus FIFO within the same priority | +| **Queueing** | Automatic via `sbatch`; managed through partitions | Set `on_events` to `[no-capacity]` under `retry` configuration | +| **Usage quotas** | Set via `sacctmgr` command per user/account/QOS | Not supported | +| **Backfill scheduling** | Enabled via `SchedulerType=sched/backfill` in `slurm.conf` | Not supported | +| **Preemption** | Configured via `PreemptType` in `slurm.conf` (QOS or partition-based) | Not supported | +| **Topology-aware scheduling** | Configured via `topology.conf` (InfiniBand switches, interconnects) | Not supported | + +### Slurm + +Slurm may use a multi-factor priority system, and limit usage across accounts, users, and runs. + +#### QOS + +Quality of Service (QOS) provides a static priority boost. Administrators create QOS levels and assign them to users as defaults: + +
+ +```shell +$ sacctmgr add qos high_priority Priority=1000 +$ sacctmgr modify qos high_priority set MaxWall=200:00:00 MaxTRES=gres/gpu=8 +``` + +
+ +Users can override the default QOS when submitting jobs via CLI (`sbatch --qos=high_priority`) or in the job script: + +
+ +```bash +#!/bin/bash +#SBATCH --qos=high_priority +``` + +
+ +#### Accounts and usage quotas + +Usage quotas limit resource consumption and can be set per user, account, or QOS: + +
+ +```shell +$ sacctmgr add account research +$ sacctmgr modify user user1 set account=research +$ sacctmgr modify user user1 set MaxWall=100:00:00 MaxTRES=gres/gpu=4 +$ sacctmgr modify account research set MaxWall=1000:00:00 MaxTRES=gres/gpu=16 +``` + +
+ +#### Monitoring commands + +Slurm provides several CLI commands to check queue status, job details, and quota usage: + +=== "Queue status" + + Use `squeue` to check queue status. Jobs are listed in scheduling order by priority: + +
+ + ```shell + $ squeue -u $USER + JOBID PARTITION NAME USER ST TIME NODES REASON + 12345 gpu training user1 PD 0:00 2 Priority + ``` + +
+ +=== "Job details" + + Use `scontrol show job` to show detailed information about a specific job: + +
+ + ```shell + $ scontrol show job 12345 + JobId=12345 JobName=training + UserId=user1(1001) GroupId=users(100) + Priority=4294 Reason=Priority (Resources) + ``` + +
+ +=== "Quota usage" + + The `sacct` command can show quota consumption per user, account, or QOS depending on the format options: + +
+ + ```shell + $ sacct -S 2024-01-01 -E 2024-01-31 --format=User,Account,TotalCPU,TotalTRES + User Account TotalCPU TotalTRES + user1 research 100:00:00 gres/gpu=50 + ``` + +
+ +#### Topology-aware scheduling + +Slurm detects network topology (InfiniBand switches, interconnects) and optimizes multi-node job placement to minimize latency. Configured in `topology.conf`, referenced from `slurm.conf`: + +
+ +```bash +SwitchName=switch1 Nodes=node[01-10] +SwitchName=switch2 Nodes=node[11-20] +``` + +
+ +When scheduling multi-node jobs, Slurm prioritizes nodes connected to the same switch to minimize network latency. + +### dstack + +`dstack` doesn't have the concept of accounts, QOS, and doesn't support usage quotas yet. + +#### Priority and retry policy + +However, `dstack` supports prioritization (integer, no multi-factor or pre-emption) and queueing jobs. + +
+ +```yaml +type: task +name: train-with-retry + +python: 3.12 +repos: + - . + +commands: + - python train.py --batch-size=64 + +resources: + gpu: 1 + memory: 32GB + +# Priority: 0-100 (FIFO within same level; default: 0) +priority: 50 + +retry: + on_events: [no-capacity] # Retry until idle instances are available (enables queueing similar to Slurm) + duration: 48h # Maximum retry time (run age for no-capacity, time since last event for error/interruption) + +max_duration: 2h +``` + +
+ +By default, the `retry` policy is not set, which means the run fails immediately if no capacity is available. + +#### Scheduled runs + +Unlike Slurm, `dstack` supports scheduled runs using the `schedule` property with cron syntax, allowing tasks to start periodically at specific UTC times. + +
+ +```yaml +type: task +name: task-with-cron + +python: 3.12 +repos: + - . + +commands: + - python task.py --batch-size=64 + +resources: + gpu: 1 + memory: 32GB + +schedule: + cron: "15 23 * * *" # every day at 23:15 UTC +``` + +
+ +#### Monitoring commands + +=== "Queue status" + The `dstack ps` command displays runs and jobs sorted by priority, reflecting the order in which they will be scheduled. + +
+ + ```shell + $ dstack ps + NAME BACKEND RESOURCES PRICE STATUS SUBMITTED + training-job aws H100:1 (spot) $4.50 provisioning 2 mins ago + ``` + +
+ +#### Topology-aware scheduling + +Topology-aware scheduling is not supported in `dstack`. While backend provisioning may respect network topology (e.g., cloud providers may provision instances with optimal inter-node connectivity), `dstack` task scheduling does not leverage topology-aware placement. + +## Partitions and fleets + +Partitions in Slurm and fleets in `dstack` both organize compute nodes for job scheduling. The key difference is that `dstack` fleets natively support dynamic cloud provisioning, whereas Slurm partitions organize pre-configured static nodes. + +| | Slurm | dstack | +|---|-------|--------| +| **Provisioning** | Static nodes only | Supports both static clusters (SSH fleets) and dynamic provisioning via backends (cloud or Kubernetes) | +| **Overlap** | Nodes can belong to multiple partitions | Each instance belongs to exactly one fleet | +| **Accounts and projects** | Multiple accounts can use the same partition; used for quotas and resource accounting | Each fleet belongs to one project | + +### Slurm + +Slurm partitions are logical groupings of static nodes defined in `slurm.conf`. Nodes can belong to multiple partitions: + +
+ +```bash +PartitionName=gpu Nodes=gpu-node[01-10] Default=NO MaxTime=24:00:00 +PartitionName=cpu Nodes=cpu-node[01-50] Default=YES MaxTime=72:00:00 +PartitionName=debug Nodes=gpu-node[01-10] Default=NO MaxTime=1:00:00 +``` + +
+ +Submit to a specific partition: + +
+ +```shell +$ sbatch --partition=gpu train.sh + Submitted batch job 12346 +``` + +
+ +### dstack + +`dstack` fleets are pools of instances (VMs or containers) that serve as both the organization unit and the provisioning template. + +`dstack` supports two types of fleets: + +| Fleet type | Description | +|------------|-------------| +| **Backend fleets** | Dynamically provisioned via configured backends (cloud or Kubernetes). Specify `resources` and `nodes` range; `dstack apply` provisions matching instances/clusters automatically. | +| **SSH fleets** | Use existing on-premises servers/clusters via `ssh_config`. `dstack apply` connects via SSH, installs dependencies. | + +=== "Backend fleets" + +
+ +    ```yaml +    type: fleet +    name: gpu-fleet + +    nodes: 0..8 + +    resources: +      gpu: A100:80GB:8 + +    # Optional: Enables inter-node connectivity; required for distributed tasks +    placement: cluster + +    # Optional: Split GPUs into blocks, allowing up to 8 workloads to share the instance +    blocks: 8 + +    backends: [aws] + +    # Spot instances for cost savings +    spot_policy: auto +    ``` + +
+ +=== "SSH fleets" + +
+ +    ```yaml +    type: fleet +    name: on-prem-gpu-fleet + +    # Optional: Enables inter-node connectivity; required for distributed tasks +    placement: cluster + +    # Optional: Allows sharing the instance across up to 8 workloads +    blocks: 8 + +    ssh_config: +      user: dstack +      identity_file: ~/.ssh/id_rsa +      hosts: +        - gpu-node01.example.com +        - gpu-node02.example.com + +      # Optional: Only required if hosts are behind a login node (bastion host) +      proxy_jump: +        hostname: login-node.example.com +        user: dstack +        identity_file: ~/.ssh/login_node_key +    ``` + +
+ +Tasks with multiple nodes require a fleet with `placement: cluster` configured, otherwise they cannot run. + +Submit to a specific fleet: + +
+ +```shell +$ dstack apply -f train.dstack.yml --fleet gpu-fleet + BACKEND REGION RESOURCES SPOT PRICE + 1 aws us-east-1 4xCPU, 16GB, T4:1 yes $0.10 + Submit the run train-model? [y/n]: y + Launching `train-model`... + ---> 100% +``` + +
+ +Create or update a fleet: + +
+ +```shell +$ dstack apply -f fleet.dstack.yml + Provisioning... + ---> 100% +``` + +
+ +List fleets: + +
+ +```shell +$ dstack fleet + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + gpu-fleet 0 aws (us-east-1) A100:80GB (spot) $0.50 idle 3 mins ago +``` + +
+ +## Filesystems and data access + +Both Slurm and `dstack` allow workloads to access filesystems (including shared filesystems) and copy files. + +| | Slurm | dstack | +|---|-------|--------| +| **Host filesystem access** | Full access by default (native processes); mounting required only for containers | Always uses containers; requires explicit mounting via `volumes` (instance or network) | +| **Shared filesystems** | Assumes global namespace (NFS, Lustre, GPFS); same path exists on all nodes | Supported via SSH fleets with instance volumes (pre-mounted network storage); network volumes for backend fleets (limited support for shared filesystems) | +| **Instance disk size** | Fixed by cluster administrator | Configurable via `disk` property in `resources` (tasks) or fleet configuration; supports ranges (e.g., `disk: 500GB` or `disk: 200GB..1TB`) | +| **Local/temporary storage** | `$SLURM_TMPDIR` (auto-cleaned on job completion) | Container filesystem (auto-cleaned on job completion; except instance volumes or network volumes) | +| **File transfer** | `sbcast` for broadcasting files to allocated nodes | `repos` and `files` properties; `rsync`/`scp` via SSH (when attached) | + +### Slurm + +Slurm assumes a shared filesystem (NFS, Lustre, GPFS) with a global namespace. The same path exists on all nodes, and `$SLURM_TMPDIR` provides local scratch space that is automatically cleaned. + +=== "Native processes" + +
+ + ```bash + #!/bin/bash + #SBATCH --nodes=4 + #SBATCH --gres=gpu:8 + #SBATCH --time=24:00:00 + + # Global namespace - same path on all nodes + # Dataset accessible at same path on all nodes + DATASET_PATH=/shared/datasets/imagenet + + # Local scratch (faster I/O, auto-cleaned) + # Copy dataset to local SSD for faster access + cp -r $DATASET_PATH $SLURM_TMPDIR/dataset + + # Training with local dataset + python train.py \ + --data=$SLURM_TMPDIR/dataset \ + --checkpoint-dir=/shared/checkpoints \ + --epochs=100 + + # $SLURM_TMPDIR automatically cleaned when job ends + # Checkpoints saved to shared filesystem persist + ``` + +
+ +=== "Containers" + + When using containers, shared filesystems must be explicitly mounted via bind mounts: + +
+ + ```bash + #!/bin/bash + #SBATCH --nodes=4 + #SBATCH --gres=gpu:8 + #SBATCH --time=24:00:00 + + # Shared filesystem mounted at /datasets and /checkpoints + DATASET_PATH=/datasets/imagenet + + # Local scratch accessible via $SLURM_TMPDIR (host storage mounted into container) + # Copy dataset to local scratch, then train + srun --container-image=/shared/images/pytorch-2.0-cuda11.8.sif \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + cp -r $DATASET_PATH $SLURM_TMPDIR/dataset + + srun --container-image=/shared/images/pytorch-2.0-cuda11.8.sif \ + --container-mounts=/shared/datasets:/datasets,/shared/checkpoints:/checkpoints \ + python train.py \ + --data=$SLURM_TMPDIR/dataset \ + --checkpoint-dir=/checkpoints \ + --epochs=100 + + # \$SLURM_TMPDIR automatically cleaned when job ends + # Checkpoints saved to mounted shared filesystem persist + ``` + +
+ +#### File broadcasting (sbcast) + +Slurm provides `sbcast` to distribute files efficiently using its internal network topology, avoiding filesystem contention: + +
+ +```bash +#!/bin/bash +#SBATCH --nodes=4 +#SBATCH --ntasks=32 + +# Broadcast file to all allocated nodes +srun --ntasks=1 --nodes=1 sbcast /shared/data/input.txt /tmp/input.txt + +# Use broadcasted file on all nodes +srun python train.py --input=/tmp/input.txt +``` + +
+ +### dstack + +`dstack` supports both accessing filesystems (including shared filesystems) and uploading/downloading code/data from the client. + +#### Instance volumes + +Instance volumes mount host directories into containers. With distributed tasks, the host can use a shared filesystem (NFS, Lustre, GPFS) to share data across jobs within the same task: + +
+ +```yaml +type: task +name: distributed-train + +nodes: 4 + +python: 3.12 +repos: + - . + +volumes: + # Host directory (can be on shared filesystem) mounted into container + - /mnt/shared/datasets:/data + - /mnt/shared/checkpoints:/checkpoints + +commands: + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + train.py \ + --data=/data \ + --checkpoint-dir=/checkpoints + +resources: + gpu: A100:80GB:8 + memory: 200GB +``` + +
+ +#### Network volumes + +Network volumes are persistent cloud storage (AWS EBS, GCP persistent disks, RunPod volumes). + +Single-node task: + +
+ +```yaml +type: task +name: train-model + +python: 3.9 +repos: + - . + +volumes: + - name: imagenet-dataset + path: /data + +commands: + - python train.py --data=/data --batch-size=64 + +resources: + gpu: 1 + memory: 32GB +``` + +
+ +Network volumes cannot be used with distributed tasks (no multi-attach support), except where multi-attach is supported (RunPod) or via volume interpolation. + +For distributed tasks, use interpolation to attach different volumes to each node. + +
+ +```yaml +type: task +name: distributed-train + +nodes: 4 + +python: 3.12 +repos: + - . + +volumes: + # Each node gets its own volume + - name: dataset-${{ dstack.node_rank }} + path: /data + +commands: + - | + torchrun \ + --nproc-per-node=$DSTACK_GPUS_PER_NODE \ + --node-rank=$DSTACK_NODE_RANK \ + --nnodes=$DSTACK_NODES_NUM \ + --master-addr=$DSTACK_MASTER_NODE_IP \ + --master-port=12345 \ + train.py \ + --data=/data + +resources: + gpu: A100:80GB:8 + memory: 200GB +``` + +
+ +Volume name interpolation is not the same as a shared filesystem—each node has its own separate volume. `dstack` currently has limited support for shared filesystems when using backend fleets. + +#### Repos and files + +The `repos` and `files` properties allow uploading code or data into the container. + +=== "Repos" + + The `repos` property clones Git repositories into the container. `dstack` clones the repo on the instance, applies local changes, and mounts it into the container. This is useful for code that needs to be version-controlled and synced. + +
+ + ```yaml + type: task + name: train-model + + python: 3.9 + + repos: + - . # Clone current directory repo + + commands: + - python train.py --batch-size=64 + + resources: + gpu: 1 + memory: 32GB + cpu: 8 + ``` + +
+ +=== "Files" + + The `files` property mounts local files or directories into the container. Each entry maps a local path to a container path. + +
+ + ```yaml + type: task + name: train-model + + python: 3.9 + + files: + - ../configs:~/configs + - ~/.ssh/id_rsa:~/ssh/id_rsa + + commands: + - python train.py --config ~/configs/model.yaml --batch-size=64 + + resources: + gpu: 1 + memory: 32GB + cpu: 8 + ``` + +
+ + Files are uploaded to the instance and mounted into the container, but are not persisted across runs (2MB limit per file, configurable). + +#### SSH file transfer + +While attached to a run, you can transfer files via `rsync` or `scp` using the run name alias: + +=== "rsync" + +
+ +    ```shell +    $ rsync -avz ./data/ <run-name>:/path/inside/container/data/ +    ``` + +
+ +=== "scp" + +
+ +    ```shell +    $ scp large-dataset.h5 <run-name>:/path/inside/container/ +    ``` + +
+ +> Uploading code/data from/to the client is not recommended as transfer speed greatly depends on network bandwidth between the CLI and the instance. + +## Interactive development + +Both Slurm and `dstack` allow allocating resources for interactive development. + +| | Slurm | dstack | +|---|-------|--------| +| **Configuration** | Uses `salloc` command to allocate resources with a time limit; resources are automatically released when time expires | Uses `type: dev-environment` configurations as first-class citizen; provisions compute and runs until explicitly stopped (optional inactivity-based termination) | +| **IDE access** | Requires SSH access to allocated nodes | Native access using desktop IDEs (VS Code, Cursor, Windsurf, etc.) or SSH | +| **SSH access** | SSH to allocated nodes (host OS) using `SLURM_NODELIST` or `srun --pty` | SSH automatically configured; access via run name alias (inside container) | + +### Slurm + +Slurm uses `salloc` to allocate resources with a time limit. `salloc` returns a shell on the login node with environment variables set; use `srun` or SSH to access compute nodes. After the time limit expires, resources are automatically released: + +
+ +```shell +$ salloc --nodes=1 --gres=gpu:1 --time=4:00:00 + salloc: Granted job allocation 12346 + +$ srun --pty bash + [user@compute-node-01 ~]$ python train.py --epochs=1 + Training epoch 1... + [user@compute-node-01 ~]$ exit + exit + +$ exit + exit + salloc: Relinquishing job allocation 12346 +``` + +
+ +Alternatively, SSH directly to allocated nodes using hostnames from `SLURM_NODELIST`: + +
+ +```shell +$ ssh $SLURM_NODELIST + [user@compute-node-01 ~]$ +``` + +
+ +### dstack + +`dstack` uses the `dev-environment` configuration type that automatically provisions an instance and runs until explicitly stopped, with optional inactivity-based termination. Access is provided via native desktop IDEs (VS Code, Cursor, Windsurf, etc.) or SSH: + +
+ +```yaml +type: dev-environment +name: ml-dev + +python: 3.12 +ide: vscode + +resources: + gpu: A100:80GB:1 + memory: 200GB + +# Optional: Maximum runtime duration (stops after this time) +max_duration: 8h + +# Optional: Auto-stop after period of inactivity (no SSH/IDE connections) +inactivity_duration: 2h + +# Optional: Auto-stop if GPU utilization is below threshold +utilization_policy: + min_gpu_utilization: 10 # Percentage + time_window: 1h +``` + +
+ +Start the dev environment: + +
+ +```shell +$ dstack apply -f dev.dstack.yml + BACKEND REGION RESOURCES SPOT PRICE + 1 runpod CA-MTL-1 9xCPU, 48GB, A5000:24GB yes $0.11 + Submit the run ml-dev? [y/n]: y + Launching `ml-dev`... + ---> 100% + To open in VS Code Desktop, use this link: + vscode://vscode-remote/ssh-remote+ml-dev/dstack/run +``` + +
+ +#### Port forwarding + +`dstack` tasks support exposing `ports` for running interactive applications like Jupyter notebooks or Streamlit apps: + +=== "Jupyter" + +
+ + ```yaml + type: task + name: jupyter + + python: 3.12 + + commands: + - pip install jupyterlab + - jupyter lab --allow-root + + ports: + - 8888 + + resources: + gpu: 1 + memory: 32GB + ``` + +
+ +=== "Streamlit" + +
+ + ```yaml + type: task + name: streamlit-app + + python: 3.12 + + commands: + - pip install streamlit + - streamlit hello + + ports: + - 8501 + + resources: + gpu: 1 + memory: 32GB + ``` + +
+ +While `dstack apply` is attached, ports are automatically forwarded to `localhost` (e.g., `http://localhost:8888` for Jupyter, `http://localhost:8501` for Streamlit). + +## Job arrays + +### Slurm job arrays + +Slurm provides native job arrays (`--array=1-100`) that create multiple job tasks from a single submission. Job arrays can be specified via CLI argument or in the job script. + +
+ +```shell +$ sbatch --array=1-100 train.sh + Submitted batch job 1001 +``` + +
+ +Each task can use the `$SLURM_ARRAY_TASK_ID` environment variable within the job script to determine its configuration. Output files can use `%A` for the job ID and `%a` for the task ID in `#SBATCH --output` and `--error` directives. + +### dstack + +`dstack` does not support native job arrays. Submit multiple runs programmatically via CLI or API. Pass a custom environment variable (e.g., `TASK_ID`) to identify each run: + +
+ +```shell +$ for i in {1..100}; do + dstack apply -f train.dstack.yml \ + --name "train-array-task-${i}" \ + --env TASK_ID=${i} \ + --detach + done +``` + +
+ + +## Environment variables and secrets + +Both Slurm and `dstack` handle sensitive data (API keys, tokens, passwords) for ML workloads. Slurm uses environment variables or files, while `dstack` provides encrypted secrets management in addition to environment variables. + +### Slurm + +Slurm uses OS-level authentication. Jobs run with the user's UID/GID and inherit the environment from the login node. No built-in secrets management; users manage credentials in their environment or shared files. + +Set environment variables in the shell before submitting (requires `--export=ALL`): + +
+ +```shell +$ export HF_TOKEN=$(cat ~/.hf_token) +$ sbatch --export=ALL train.sh + Submitted batch job 12346 +``` + +
+ +### dstack + +In addition to environment variables (`env`), `dstack` provides a secrets management system with encryption. Secrets are referenced in configuration using `${{ secrets.name }}` syntax. + +Set secrets: + +
+ +```shell +$ dstack secret set huggingface_token +$ dstack secret set wandb_api_key +``` + +
+ +Use secrets in configuration: + +
+ +```yaml +type: task +name: train-with-secrets + +python: 3.12 +repos: + - . + +env: + - HF_TOKEN=${{ secrets.huggingface_token }} + - WANDB_API_KEY=${{ secrets.wandb_api_key }} + +commands: + - pip install huggingface_hub + - huggingface-cli download meta-llama/Llama-2-7b-hf + - wandb login + - python train.py + +resources: + gpu: A100:80GB:8 +``` + +
+ +## Authentication + +### Slurm + +Slurm uses OS-level authentication. Users authenticate via SSH to login nodes using their Unix accounts. Jobs run with the user's UID/GID, ensuring user isolation—users cannot access other users' files or processes. Slurm enforces file permissions based on Unix UID/GID and association limits (MaxJobs, MaxSubmitJobs) configured per user or account. + +### dstack + +`dstack` uses token-based authentication. Users are registered within projects on the server, and each user is issued a token. This token is used for authentication with all CLI and API commands. Access is controlled at the project level with user roles: + +| Role | Permissions | +|------|-------------| +| **Admin** | Can manage project settings, including backends, gateways, and members | +| **Manager** | Can manage project members but cannot configure backends and gateways | +| **User** | Can manage project resources including runs, fleets, and volumes | + +`dstack` manages SSH keys on the server for secure access to runs and instances. User SSH keys are automatically generated and used when attaching to runs via `dstack attach` or `dstack apply`. Project SSH keys are used by the server to establish SSH connections to provisioned instances. + +!!! note "Multi-tenancy isolation" + `dstack` currently does not offer full isolation for multi-tenancy. Users may access global resources within the host. 
+ +## Monitoring and observability + +Both systems provide tools to monitor job/run status, cluster/node status, resource metrics, and logs: + +| | Slurm | dstack | +|---|-------|--------| +| **Job/run status** | `squeue` lists jobs in queue | `dstack ps` lists active runs | +| **Cluster/node status** | `sinfo` shows node availability | `dstack fleet` lists instances | +| **CPU/memory metrics** | `sstat` for running jobs | `dstack metrics` for real-time metrics | +| **GPU metrics** | Requires SSH to nodes, `nvidia-smi` per node | Automatic collection via `nvidia-smi`/`amd-smi`, `dstack metrics` | +| **Job history** | `sacct` for completed jobs | `dstack ps -n NUM` shows run history | +| **Logs** | Written to files (`--output`, `--error`) | Streamed via API, `dstack logs` | + +### Slurm + +Slurm provides command-line tools for monitoring cluster state, jobs, and history. + +Check node status: + +
+ +```shell +$ sinfo + PARTITION AVAIL TIMELIMIT NODES STATE NODELIST + gpu up 1-00:00:00 10 idle gpu-node[01-10] +``` + +
+ +Check job queue: + +
+ +```shell +$ squeue -u $USER + JOBID PARTITION NAME USER ST TIME NODES + 12345 gpu training user1 R 2:30 2 +``` + +
+ +Check job details: + +
+ +```shell +$ scontrol show job 12345 + JobId=12345 JobName=training + UserId=user1(1001) GroupId=users(100) + NumNodes=2 NumCPUs=64 NumTasks=32 + Gres=gpu:8(IDX:0,1,2,3,4,5,6,7) +``` + +
+ +Check resource usage for running jobs (`sstat` only works for running jobs): + +
+ +```shell +$ sstat --job=12345 --format=JobID,MaxRSS,MaxVMSize,CPUUtil + JobID MaxRSS MaxVMSize CPUUtil + 12345.0 2048M 4096M 95.2% +``` + +
+ +Check GPU usage (requires SSH to node): + +
+ +```shell +$ srun --jobid=12345 --pty nvidia-smi + GPU 0: 95% utilization, 72GB/80GB memory +``` + +
+ +Check job history for completed jobs: + +
+ +```shell +$ sacct --job=12345 --format=JobID,Elapsed,MaxRSS,State,ExitCode + JobID Elapsed MaxRSS State ExitCode + 12345 2:30:00 2048M COMPLETED 0:0 +``` + +
+ +View logs (written to files via `--output` and `--error` flags; typically in the submission directory on a shared filesystem): + +
+ +```shell +$ cat slurm-12345.out + Training started... + Epoch 1/10: loss=0.5 +``` + +
+ +If logs are on compute nodes, find the node from `scontrol show job`, then access via `srun --jobid` (running jobs) or SSH (completed jobs): + +
+ +```shell +$ srun --jobid=12345 --nodelist=gpu-node01 --pty bash +$ cat slurm-12345.out +``` + +
+ +### dstack + +`dstack` automatically collects essential metrics (CPU, memory, GPU utilization) using vendor utilities (`nvidia-smi`, `amd-smi`, etc.) and provides real-time monitoring via CLI. + +List runs: + +
+ +```shell +$ dstack ps + NAME BACKEND GPU PRICE STATUS SUBMITTED + training-job aws H100:1 (spot) $4.50 running 5 mins ago +``` + +
+ +List fleets and instances (shows GPU health status): + +
+ +```shell +$ dstack fleet + FLEET INSTANCE BACKEND RESOURCES STATUS PRICE CREATED + my-fleet 0 aws (us-east-1) T4:16GB:1 idle $0.526 11 mins ago + 1 aws (us-east-1) T4:16GB:1 idle (warning) $0.526 11 mins ago +``` + +
+ +Check real-time metrics: + +
+ +```shell +$ dstack metrics training-job + NAME STATUS CPU MEMORY GPU + training-job running 45% 16.27GB/200GB gpu=0 mem=72.48GB/80GB util=95% +``` + +
+ +Stream logs (stored centrally using external storage services like CloudWatch Logs or GCP Logging, accessible via CLI and UI): + +
+ +```shell +$ dstack logs training-job + Training started... + Epoch 1/10: loss=0.5 +``` + +
+ +#### Prometheus integration + +`dstack` exports additional metrics to Prometheus: + +| Metric type | Description | +|-------------|-------------| +| **Fleet metrics** | Instance duration, price, GPU count | +| **Run metrics** | Run counters (total, terminated, failed, done) | +| **Job metrics** | Execution time, cost, CPU/memory/GPU usage | +| **DCGM telemetry** | Temperature, ECC errors, PCIe replay counters, NVLink errors | +| **Server health** | HTTP request metrics | + +To enable Prometheus export, set the `DSTACK_ENABLE_PROMETHEUS_METRICS` environment variable and configure Prometheus to scrape metrics from `/metrics`. + +> GPU health monitoring is covered in the [GPU health monitoring](#gpu-health-monitoring) section below. + +## Fault tolerance, checkpointing, and retry + +Both systems support fault tolerance for long-running training jobs that may be interrupted by hardware failures, spot instance terminations, or other issues: + +| | Slurm | dstack | +|---|-------|--------| +| **Retry** | `--requeue` flag requeues jobs on node failure (hardware crash) or preemption, not application failures (software crashes); all nodes requeued together (all-or-nothing) | `retry` property with `on_events` (`error`, `interruption`) and `duration`; all jobs stopped and run resubmitted if any job fails (all-or-nothing) | +| **Graceful stop** | Grace period with `SIGTERM` before `SIGKILL`; `--signal` sends signal before time limit (e.g., `--signal=B:USR1@300`) | Not supported | +| **Checkpointing** | Application-based; save to shared filesystem | Application-based; save to persistent volumes | +| **Instance health** | `HealthCheckProgram` in `slurm.conf` runs custom scripts (DCGM/RVS); non-zero exit drains node (excludes from new scheduling, running jobs continue) | Automatic GPU health monitoring via DCGM; unhealthy instances excluded from scheduling | + +### Slurm + +Slurm handles three types of failures: system failures (hardware crash), application failures (software 
crash), and preemption. + +Enable automatic requeue on node failure (not application failures). For distributed jobs, if one node fails, the entire job is requeued (all-or-nothing): + +
+ +```bash +#!/bin/bash +#SBATCH --job-name=train-with-checkpoint +#SBATCH --nodes=4 +#SBATCH --gres=gpu:8 +#SBATCH --time=48:00:00 +#SBATCH --requeue # Requeue on node failure only + +srun python train.py +``` + +
+ +Preempted jobs receive `SIGTERM` during a grace period before `SIGKILL` and are typically requeued automatically. Use `--signal` to send a custom signal before the time limit expires: + +
+ +```bash +#!/bin/bash +#SBATCH --job-name=train-with-checkpoint +#SBATCH --nodes=4 +#SBATCH --gres=gpu:8 +#SBATCH --time=48:00:00 +#SBATCH --signal=B:USR1@300 # Send USR1 5 minutes before time limit + +trap 'python save_checkpoint.py --checkpoint-dir=/shared/checkpoints' USR1 + +if [ -f /shared/checkpoints/latest.pt ]; then + RESUME_FLAG="--resume /shared/checkpoints/latest.pt" +fi + +srun python train.py \ + --checkpoint-dir=/shared/checkpoints \ + $RESUME_FLAG +``` + +
+ +Checkpoints are saved to a shared filesystem. Applications must implement checkpointing logic. + +Custom health checks are configured via `HealthCheckProgram` in `slurm.conf`: + +
+ +```bash +HealthCheckProgram=/shared/scripts/gpu_health_check.sh +``` + +
+ +The health check script should exit with non-zero code to drain the node: + +
+ +```bash +#!/bin/bash +dcgmi diag -r 1 +if [ $? -ne 0 ]; then + exit 1 # Non-zero exit drains node +fi +``` + +
+ +Drained nodes are excluded from new scheduling, but running jobs continue until completion. + +### dstack + +`dstack` handles three types of failures: provisioning failures (`no-capacity`), job failures (`error`), and interruptions (`interruption`). The `error` event is triggered by application failures (non-zero exit code) and instance unreachable issues. The `interruption` event is triggered by spot instance terminations and network/hardware issues. + +By default, runs fail immediately. Enable retry via the `retry` property to handle these events: + +
+ +```yaml +type: task +name: train-with-checkpoint-retry + +nodes: 4 + +python: 3.12 +repos: + - . + +volumes: + # Use instance volumes (host directories) or network volumes (cloud-managed persistent storage) + - name: checkpoint-volume + path: /checkpoints + +commands: + - | + if [ -f /checkpoints/latest.pt ]; then + RESUME_FLAG="--resume /checkpoints/latest.pt" + fi + python train.py \ + --checkpoint-dir=/checkpoints \ + $RESUME_FLAG + +resources: + gpu: A100:80GB:8 + memory: 200GB + +spot_policy: auto + +retry: + on_events: [error, interruption] + duration: 48h +``` + +
+ +For distributed tasks, if any job fails and retry is enabled, all jobs are stopped and the run is resubmitted (all-or-nothing). + +Unlike Slurm, `dstack` does not support graceful shutdown signals. Applications must implement proactive checkpointing (periodic saves) and check for existing checkpoints on startup to resume after retries. + +## GPU health monitoring + +Both systems monitor GPU health to prevent degraded hardware from affecting workloads: + +| | Slurm | dstack | +|---|-------|--------| +| **Health checks** | Custom scripts (DCGM/RVS) via `HealthCheckProgram` in `slurm.conf`; typically active diagnostics (`dcgmi diag`) or passive health watches | Automatic DCGM health watches (passive, continuous monitoring) | +| **Failure handling** | Non-zero exit drains node (excludes from new scheduling, running jobs continue); status: DRAIN/DRAINED | Unhealthy instances excluded from scheduling; status shown in `dstack fleet`: `idle` (healthy), `idle (warning)`, `idle (failure)` | + +### Slurm + +Configure custom health check scripts via `HealthCheckProgram` in `slurm.conf`. Scripts typically use DCGM diagnostics (`dcgmi diag`) for NVIDIA GPUs or RVS for AMD GPUs: + +
+ +```bash +HealthCheckProgram=/shared/scripts/gpu_health_check.sh +``` + +
+ +
+ +```bash +#!/bin/bash +dcgmi diag -r 1 # DCGM diagnostic for NVIDIA GPUs +if [ $? -ne 0 ]; then + exit 1 # Non-zero exit drains node +fi +``` + +
+ +Drained nodes are excluded from new scheduling, but running jobs continue until completion. + +### dstack + +`dstack` automatically monitors GPU health using DCGM background health checks on instances with NVIDIA GPUs. Supported on cloud backends where DCGM is pre-installed automatically (or comes with users' `os_images`) and SSH fleets where DCGM packages (`datacenter-gpu-manager-4-core`, `datacenter-gpu-manager-4-proprietary`, `datacenter-gpu-manager-exporter`) are installed on hosts. + +> AMD GPU health monitoring is not supported yet. + +Health status is displayed in `dstack fleet`: + +
+ +```shell +$ dstack fleet + FLEET INSTANCE BACKEND RESOURCES STATUS PRICE CREATED + my-fleet 0 aws (us-east-1) T4:16GB:1 idle $0.526 11 mins ago + 1 aws (us-east-1) T4:16GB:1 idle (warning) $0.526 11 mins ago + 2 aws (us-east-1) T4:16GB:1 idle (failure) $0.526 11 mins ago +``` + +
+ +Health status: + +| Status | Description | +|--------|-------------| +| `idle` | Healthy, no issues detected | +| `idle (warning)` | Non-fatal issues (e.g., correctable ECC errors); instance still usable | +| `idle (failure)` | Fatal issues (uncorrectable ECC, PCIe failures); instance excluded from scheduling | + +GPU health metrics are also exported to Prometheus (see [Prometheus integration](#prometheus-integration)). + +## Job dependencies + +Job dependencies enable chaining tasks together, ensuring that downstream jobs only run after upstream jobs complete. + +### Slurm dependencies + +Slurm provides native dependency support via `--dependency` flags. Dependencies are managed by Slurm: + +| Dependency type | Description | +|----------------|-------------| +| **`afterok`** | Runs only if the dependency job finishes with Exit Code 0 (success) | +| **`afterany`** | Runs regardless of success or failure (useful for cleanup jobs) | +| **`aftercorr`** | For array jobs, allows corresponding tasks to start as soon as the matching task in the dependency array completes (e.g., Task 1 of Array B starts when Task 1 of Array A finishes, without waiting for the entire Array A) | +| **`singleton`** | Based on job name and user (not job IDs), ensures only one job with the same name runs at a time for that user (useful for serializing access to shared resources) | + +Submit a job that depends on another job completing successfully: + +
+ +```shell +$ JOB_TRAIN=$(sbatch train.sh | awk '{print $4}') + Submitted batch job 1001 + +$ sbatch --dependency=afterok:$JOB_TRAIN evaluate.sh + Submitted batch job 1002 +``` + +
+ +Submit a job with singleton dependency (only one job with this name runs at a time): + +
+ +```shell +$ sbatch --job-name=ModelTraining --dependency=singleton train.sh + Submitted batch job 1004 +``` + +
+ +### dstack { #dstack-workflow-orchestration } + +`dstack` does not support native job dependencies. Use external workflow orchestration tools (Airflow, Prefect, etc.) to implement dependencies. + +=== "Prefect" + + ```python + from prefect import flow, task + import subprocess + + @task + def train_model(): + """Submit training job and wait for completion""" + subprocess.run( + ["dstack", "apply", "-f", "train.dstack.yml", "--name", "train-run"], + check=True # Raises exception if training fails + ) + return "train-run" + + @task + def evaluate_model(run_name): + """Submit evaluation job after training succeeds""" + subprocess.run( + ["dstack", "apply", "-f", "evaluate.dstack.yml", "--name", f"eval-{run_name}"], + check=True + ) + + @flow + def ml_pipeline(): + train_run = train_model() + evaluate_model(train_run) + ``` + +=== "Airflow" + + ```python + from airflow.decorators import dag, task + from datetime import datetime + import subprocess + + @dag(schedule=None, start_date=datetime(2024, 1, 1), catchup=False) + def ml_training_pipeline(): + @task + def train(context): + """Submit training job and wait for completion""" + run_name = f"train-{context['ds']}" + subprocess.run( + ["dstack", "apply", "-f", "train.dstack.yml", "--name", run_name], + check=True # Raises exception if training fails + ) + return run_name + + @task + def evaluate(run_name, context): + """Submit evaluation job after training succeeds""" + eval_name = f"eval-{run_name}" + subprocess.run( + ["dstack", "apply", "-f", "evaluate.dstack.yml", "--name", eval_name], + check=True + ) + + # Define task dependencies - train() completes before evaluate() starts + train_run = train() + evaluate(train_run) + + ml_training_pipeline() + ``` + +## Heterogeneous jobs + +Heterogeneous jobs (het jobs) allow a single job to request different resource configurations for different components (e.g., GPU nodes for training, high-memory CPU nodes for preprocessing). 
This is an edge case used for coordinated multi-component workflows. + +### Slurm + +Slurm supports heterogeneous jobs via `#SBATCH hetjob` and `--het-group` flags. Each component can specify different resources: + +```bash +#!/bin/bash +#SBATCH --job-name=ml-pipeline +#SBATCH hetjob +#SBATCH --het-group=0 --nodes=2 --gres=gpu:8 --mem=200G +#SBATCH --het-group=1 --nodes=1 --mem=500G --partition=highmem + +# Use SLURM_JOB_COMPONENT_ID to identify the component +if [ "$SLURM_JOB_COMPONENT_ID" -eq 0 ]; then + srun python train.py +elif [ "$SLURM_JOB_COMPONENT_ID" -eq 1 ]; then + srun python preprocess.py +fi +``` + +### dstack + +`dstack` does not support heterogeneous jobs natively. Use separate runs with [workflow orchestration tools (Prefect, Airflow)](#dstack-workflow-orchestration) or submit multiple runs programmatically to coordinate components with different resource requirements. + +## What's next? + +1. Check out [Quickstart](../../quickstart.md) +2. Read about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md) +3. 
Browse the [examples](../../../examples.md) \ No newline at end of file diff --git a/docs/docs/guides/migration.md b/docs/docs/guides/upgrade.md similarity index 99% rename from docs/docs/guides/migration.md rename to docs/docs/guides/upgrade.md index 3ca019fbb5..aacf473fd8 100644 --- a/docs/docs/guides/migration.md +++ b/docs/docs/guides/upgrade.md @@ -1,4 +1,4 @@ -# Migration guide +# Upgrade guide diff --git a/docs/layouts/custom.yml b/docs/layouts/custom.yml index 0ab859b854..74a0637b2d 100644 --- a/docs/layouts/custom.yml +++ b/docs/layouts/custom.yml @@ -50,12 +50,12 @@ size: { width: 1200, height: 630 } layers: - background: color: "black" - - size: { width: 50, height: 50 } - offset: { x: 935, y: 521 } + - size: { width: 65, height: 60 } + offset: { x: 908, y: 499 } background: image: *logo - - size: { width: 340, height: 55 } - offset: { x: 993, y: 521 } + - size: { width: 360, height: 59 } + offset: { x: 975, y: 502 } typography: content: *site_name color: "white" @@ -69,15 +69,15 @@ layers: line: amount: 3 height: 1.25 - # - size: { width: 850, height: 64 } - # offset: { x: 80, y: 495 } - # typography: - # content: *page_description - # align: start - # color: "white" - # line: - # amount: 2 - # height: 1.5 + - size: { width: 870, height: 64 } + offset: { x: 80, y: 498 } + typography: + content: *page_description + align: start + color: "white" + line: + amount: 2 + height: 1.5 tags: diff --git a/docs/overrides/home.html b/docs/overrides/home.html index d693c9d015..ced53fb1e8 100644 --- a/docs/overrides/home.html +++ b/docs/overrides/home.html @@ -455,22 +455,14 @@

FAQ

- dstack fully replaces Slurm. Its - tasks cover job submission, queuing, retries, GPU - health checks, and scheduling for single-node and distributed runs. + Slurm is a battle-tested system with decades of production use in HPC environments. + dstack by contrast, is built for modern ML/AI workloads with cloud-native provisioning and a container-first architecture. + While both support distributed training and batch jobs, dstack + also natively supports development and production-grade inference.

- Beyond job scheduling, dstack adds - dev environments for interactive work, - services for production endpoints, and - fleets that give fine-grained control over - cluster provisioning and placement. -

- -

- You get one platform for development, training, and deployment across cloud, Kubernetes, and - on-prem. + See the migration guide for a detailed comparison.

diff --git a/mkdocs.yml b/mkdocs.yml index 74939703e3..07eed5f3b7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,7 +3,7 @@ site_name: dstack site_url: https://dstack.ai site_author: dstack GmbH site_description: >- - dstack is an open-source control plane for running development, training, and inference jobs on GPUs - across hyperscalers, neoclouds, or on-prem. + dstack is an open-source control plane for GPU provisioning and orchestration across GPU clouds, Kubernetes, and on-prem clusters. # Repository repo_url: https://github.com/dstackai/dstack @@ -170,6 +170,7 @@ plugins: "examples/clusters/a3mega/index.md": "examples/clusters/gcp/index.md" "examples/clusters/a4/index.md": "examples/clusters/gcp/index.md" "examples/clusters/efa/index.md": "examples/clusters/aws/index.md" + "docs/guides/migration.md": "docs/guides/upgrade.md" - typeset - gen-files: scripts: # always relative to mkdocs.yml @@ -277,7 +278,9 @@ nav: - Troubleshooting: docs/guides/troubleshooting.md - Metrics: docs/guides/metrics.md - Protips: docs/guides/protips.md - - Migration: docs/guides/migration.md + - Upgrade: docs/guides/upgrade.md + - Migration: + - Slurm: docs/guides/migration/slurm.md - Reference: - .dstack.yml: - dev-environment: docs/reference/dstack.yml/dev-environment.md From 22296d6e34bf1f2400e5374d421a849333ca61c4 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Tue, 13 Jan 2026 14:12:03 +0100 Subject: [PATCH 046/187] Linter fix --- docs/docs/guides/migration/slurm.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/guides/migration/slurm.md b/docs/docs/guides/migration/slurm.md index 82c1548a4b..d006497399 100644 --- a/docs/docs/guides/migration/slurm.md +++ b/docs/docs/guides/migration/slurm.md @@ -1847,4 +1847,4 @@ fi 1. Check out [Quickstart](../../quickstart.md) 2. Read about [dev environments](../../concepts/dev-environments.md), [tasks](../../concepts/tasks.md), and [services](../../concepts/services.md) -3. 
Browse the [examples](../../../examples.md) \ No newline at end of file +3. Browse the [examples](../../../examples.md) From a36577f97560a1d7067c5fa9370da188202c7f7e Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Wed, 14 Jan 2026 17:41:52 +0000 Subject: [PATCH 047/187] [Internal]: Handle GitHub API errors in `release_notes.py` (#3463) This improves the script error message if the GitHub API call is not successful (e.g., if the token is expired). Before: ``` TypeError: string indices must be integers, not 'str' ``` After: ``` Exception: Error getting GitHub releases; status: 401, body: { "message": "Bad credentials", "documentation_url": "https://docs.github.com/rest", "status": "401" } ``` --- scripts/release_notes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/release_notes.py b/scripts/release_notes.py index bcc659c462..ab2da2d210 100644 --- a/scripts/release_notes.py +++ b/scripts/release_notes.py @@ -32,6 +32,9 @@ def get_draft_release_by_tag(tag: str) -> dict: headers={"Authorization": f"token {GITHUB_TOKEN}"}, timeout=10, ) + if not r.ok: + msg = f"Error getting GitHub releases; status: {r.status_code}, body: {r.text}" + raise Exception(msg) for release in r.json(): if release["tag_name"] == tag and release["draft"]: return release From c90cdf10d3871528c5aaa6f7bb9081cf188957e2 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 15 Jan 2026 08:27:59 +0000 Subject: [PATCH 048/187] Display `InstanceAvailability.NO_BALANCE` in CLI (#3460) In apply plans and `dstack offer`, display the `NO_BALANCE` availability as `no balance` rather than an empty string. Small related changes: - Refactor availability formatting so that it is consistent across run plans, fleet plans, and `dstack offer`. In fleet plans, availabilities are now displayed in lower case (previously, this was the only place where they were capitalized). 
- In `dstack offer --group-by gpu`, if a GPU is unavailable due to more than one reason, display all those reasons (previously, only one of the availabilities was displayed).
- Default to displaying unknown availabilities rather than falling back to an empty string. This will allow new availability types added in the future to automatically become visible in the CLI.
---
 frontend/src/pages/Offers/List/index.tsx          |  4 ++++
 .../_internal/cli/services/configurators/fleet.py | 11 +++--------
 src/dstack/_internal/cli/utils/common.py          |  7 +++++++
 src/dstack/_internal/cli/utils/gpu.py             | 13 +++++---------
 src/dstack/_internal/cli/utils/run.py             | 12 ++----------
 src/dstack/_internal/core/models/instances.py     |  4 +---
 6 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/frontend/src/pages/Offers/List/index.tsx b/frontend/src/pages/Offers/List/index.tsx
index f782a7fb42..edf747d251 100644
--- a/frontend/src/pages/Offers/List/index.tsx
+++ b/frontend/src/pages/Offers/List/index.tsx
@@ -181,6 +181,10 @@ export const OfferList: React.FC = ({ withSearchParams, onChange
         {
             id: 'availability',
             content: (gpu: IGpu) => {
+                // FIXME: array to string comparison never passes.
+                // Additionally, there are more availability statuses that are worth displaying,
+                // and several of them may be present at once.
+ // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-expect-error if (gpu.availability === 'not_available') { diff --git a/src/dstack/_internal/cli/services/configurators/fleet.py b/src/dstack/_internal/cli/services/configurators/fleet.py index 89278feb94..27b607cb4a 100644 --- a/src/dstack/_internal/cli/services/configurators/fleet.py +++ b/src/dstack/_internal/cli/services/configurators/fleet.py @@ -14,6 +14,7 @@ NO_OFFERS_WARNING, confirm_ask, console, + format_instance_availability, ) from dstack._internal.cli.utils.fleet import get_fleets_table from dstack._internal.cli.utils.rich import MultiItemStatus @@ -32,7 +33,7 @@ FleetSpec, InstanceGroupPlacement, ) -from dstack._internal.core.models.instances import InstanceAvailability, InstanceStatus, SSHKey +from dstack._internal.core.models.instances import InstanceStatus, SSHKey from dstack._internal.core.services.diff import diff_models from dstack._internal.utils.common import local_time from dstack._internal.utils.logging import get_logger @@ -420,12 +421,6 @@ def th(s: str) -> str: for index, offer in enumerate(print_offers, start=1): resources = offer.instance.resources - availability = "" - if offer.availability in { - InstanceAvailability.NOT_AVAILABLE, - InstanceAvailability.NO_QUOTA, - }: - availability = offer.availability.value.replace("_", " ").title() offers_table.add_row( f"{index}", offer.backend.replace("remote", "ssh"), @@ -434,7 +429,7 @@ def th(s: str) -> str: resources.pretty_format(), "yes" if resources.spot else "no", f"${offer.price:3f}".rstrip("0").rstrip("."), - availability, + format_instance_availability(offer.availability), style=None if index == 1 else "secondary", ) if len(plan.offers) > offers_limit: diff --git a/src/dstack/_internal/cli/utils/common.py b/src/dstack/_internal/cli/utils/common.py index c5b185a4b1..d53b84567b 100644 --- a/src/dstack/_internal/cli/utils/common.py +++ b/src/dstack/_internal/cli/utils/common.py @@ -12,6 +12,7 @@ from dstack._internal 
import settings from dstack._internal.cli.utils.rich import DstackRichHandler from dstack._internal.core.errors import CLIError, DstackError +from dstack._internal.core.models.instances import InstanceAvailability from dstack._internal.utils.common import get_dstack_dir, parse_since _colors = { @@ -146,3 +147,9 @@ def resolve_url(url: str, timeout: float = 5.0) -> str: except requests.exceptions.ConnectionError as e: raise ValueError(f"Failed to resolve url {url}") from e return response.url + + +def format_instance_availability(v: InstanceAvailability) -> str: + if v in (InstanceAvailability.UNKNOWN, InstanceAvailability.AVAILABLE): + return "" + return v.value.replace("_", " ").lower() diff --git a/src/dstack/_internal/cli/utils/gpu.py b/src/dstack/_internal/cli/utils/gpu.py index 89638cb62f..3d19b173ba 100644 --- a/src/dstack/_internal/cli/utils/gpu.py +++ b/src/dstack/_internal/cli/utils/gpu.py @@ -4,7 +4,7 @@ from rich.table import Table from dstack._internal.cli.models.offers import OfferCommandGroupByGpuOutput, OfferRequirements -from dstack._internal.cli.utils.common import console +from dstack._internal.cli.utils.common import console, format_instance_availability from dstack._internal.core.models.gpus import GpuGroup from dstack._internal.core.models.profiles import SpotPolicy from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_map @@ -117,13 +117,10 @@ def print_gpu_table(gpus: List[GpuGroup], run_spec: RunSpec, group_by: List[str] availability = "" has_available = any(av.is_available() for av in gpu_group.availability) - has_unavailable = any(not av.is_available() for av in gpu_group.availability) - - if has_unavailable and not has_available: - for av in gpu_group.availability: - if av.value in {"not_available", "no_quota", "idle", "busy"}: - availability = av.value.replace("_", " ").lower() - break + if not has_available: + availability = ", ".join( + map(format_instance_availability, set(gpu_group.availability)) + ) 
secondary_style = "grey58" row_data = [ diff --git a/src/dstack/_internal/cli/utils/run.py b/src/dstack/_internal/cli/utils/run.py index 1b6dfbaeda..dec354e984 100644 --- a/src/dstack/_internal/cli/utils/run.py +++ b/src/dstack/_internal/cli/utils/run.py @@ -11,11 +11,11 @@ NO_OFFERS_WARNING, add_row_from_dict, console, + format_instance_availability, ) from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.configurations import DevEnvironmentConfiguration from dstack._internal.core.models.instances import ( - InstanceAvailability, InstanceOfferWithAvailability, InstanceType, ) @@ -168,14 +168,6 @@ def th(s: str) -> str: for i, offer in enumerate(job_plan.offers, start=1): r = offer.instance.resources - availability = "" - if offer.availability in { - InstanceAvailability.NOT_AVAILABLE, - InstanceAvailability.NO_QUOTA, - InstanceAvailability.IDLE, - InstanceAvailability.BUSY, - }: - availability = offer.availability.value.replace("_", " ").lower() instance = offer.instance.name if offer.total_blocks > 1: instance += f" ({offer.blocks}/{offer.total_blocks})" @@ -185,7 +177,7 @@ def th(s: str) -> str: r.pretty_format(include_spot=True), instance, f"${offer.price:.4f}".rstrip("0").rstrip("."), - availability, + format_instance_availability(offer.availability), style=None if i == 1 or not include_run_properties else "secondary", ) if job_plan.total_offers > len(job_plan.offers): diff --git a/src/dstack/_internal/core/models/instances.py b/src/dstack/_internal/core/models/instances.py index 2bc0c1f898..bf1696758d 100644 --- a/src/dstack/_internal/core/models/instances.py +++ b/src/dstack/_internal/core/models/instances.py @@ -205,9 +205,7 @@ class InstanceAvailability(Enum): AVAILABLE = "available" NOT_AVAILABLE = "not_available" NO_QUOTA = "no_quota" - NO_BALANCE = ( - "no_balance" # Introduced in 0.19.24, may be used after a short compatibility period - ) + NO_BALANCE = "no_balance" # For dstack Sky IDLE = "idle" BUSY = 
"busy" From 4432cdfe8043b8369716a8108b3ae049920bd4b5 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 15 Jan 2026 08:28:44 +0000 Subject: [PATCH 049/187] Do not return `NO_BALANCE` to older clients (#3462) Since only newer CLIs can correctly display `InstanceAvailability.NO_BALANCE`, replace `NO_BALANCE` with `NOT_AVAILABLE` in server responses for older clients for the following API methods: - `/api/project/{project_name}/fleets/get_plan` - `/api/project/{project_name}/runs/get_plan` - `/api/project/{project_name}/gpus/list` Additionally, refactor the code to make it easy to retrieve the client version using FastAPI dependencies. ```python client_version: Annotated[Optional[Version], Depends(get_client_version)] ``` --- src/dstack/_internal/server/app.py | 55 ++++++------- .../server/compatibility/__init__.py | 0 .../_internal/server/compatibility/common.py | 20 +++++ .../_internal/server/compatibility/gpus.py | 22 +++++ src/dstack/_internal/server/routers/fleets.py | 7 +- src/dstack/_internal/server/routers/gpus.py | 14 +++- src/dstack/_internal/server/routers/runs.py | 17 ++-- src/dstack/_internal/server/utils/routers.py | 37 ++++----- .../_internal/server/routers/test_fleets.py | 63 +++++++++++++++ .../_internal/server/routers/test_gpus.py | 47 ++++++++++- .../_internal/server/routers/test_runs.py | 73 +++++++++++++++++ src/tests/_internal/server/test_app.py | 80 +++++++++++++++++++ .../_internal/server/utils/test_routers.py | 68 ++++++---------- 13 files changed, 399 insertions(+), 104 deletions(-) create mode 100644 src/dstack/_internal/server/compatibility/__init__.py create mode 100644 src/dstack/_internal/server/compatibility/common.py create mode 100644 src/dstack/_internal/server/compatibility/gpus.py diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index 488a5a9e0e..b41152c149 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -5,16 
+5,18 @@ from concurrent.futures import ThreadPoolExecutor from contextlib import asynccontextmanager from pathlib import Path -from typing import Awaitable, Callable, List, Optional +from typing import Annotated, Awaitable, Callable, List, Optional import sentry_sdk -from fastapi import FastAPI, Request, Response, status +from fastapi import Depends, FastAPI, Request, Response, status from fastapi.datastructures import URL from fastapi.responses import HTMLResponse, RedirectResponse from fastapi.staticfiles import StaticFiles +from packaging.version import Version from prometheus_client import Counter, Histogram from sentry_sdk.types import SamplingContext +from dstack._internal import settings as core_settings from dstack._internal.cli.utils.common import console from dstack._internal.core.errors import ForbiddenError, ServerClientError from dstack._internal.core.services.configs import update_default_project @@ -68,7 +70,6 @@ get_client_version, get_server_client_error_details, ) -from dstack._internal.settings import DSTACK_VERSION from dstack._internal.utils.logging import get_logger from dstack._internal.utils.ssh import check_required_ssh_version @@ -91,6 +92,9 @@ def create_app() -> FastAPI: app = FastAPI( docs_url="/api/docs", lifespan=lifespan, + dependencies=[ + Depends(_check_client_version), + ], ) app.state.proxy_dependency_injector = ServerProxyDependencyInjector() return app @@ -102,7 +106,7 @@ async def lifespan(app: FastAPI): if settings.SENTRY_DSN is not None: sentry_sdk.init( dsn=settings.SENTRY_DSN, - release=DSTACK_VERSION, + release=core_settings.DSTACK_VERSION, environment=settings.SERVER_ENVIRONMENT, enable_tracing=True, traces_sampler=_sentry_traces_sampler, @@ -164,7 +168,9 @@ async def lifespan(app: FastAPI): else: logger.info("Background processing is disabled") PROBES_SCHEDULER.start() - dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)" + dstack_version = ( + core_settings.DSTACK_VERSION if 
core_settings.DSTACK_VERSION else "(no version)" + ) job_network_mode_log = ( logger.info if settings.JOB_NETWORK_MODE != settings.DEFAULT_JOB_NETWORK_MODE @@ -336,32 +342,6 @@ def _extract_endpoint_label(request: Request, response: Response) -> str: ).inc() return response - @app.middleware("http") - async def check_client_version(request: Request, call_next): - if ( - not request.url.path.startswith("/api/") - or request.url.path in _NO_API_VERSION_CHECK_ROUTES - ): - return await call_next(request) - try: - client_version = get_client_version(request) - except ValueError as e: - return CustomORJSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={"detail": [error_detail(str(e))]}, - ) - client_release: Optional[tuple[int, ...]] = None - if client_version is not None: - client_release = client_version.release - request.state.client_release = client_release - response = check_client_server_compatibility( - client_version=client_version, - server_version=DSTACK_VERSION, - ) - if response is not None: - return response - return await call_next(request) - @app.get("/healthcheck") async def healthcheck(): return CustomORJSONResponse(content={"status": "running"}) @@ -396,6 +376,19 @@ async def index(): return RedirectResponse("/api/docs") +def _check_client_version( + request: Request, client_version: Annotated[Optional[Version], Depends(get_client_version)] +) -> None: + if ( + request.url.path.startswith("/api/") + and request.url.path not in _NO_API_VERSION_CHECK_ROUTES + ): + check_client_server_compatibility( + client_version=client_version, + server_version=core_settings.DSTACK_VERSION, + ) + + def _is_proxy_request(request: Request) -> bool: if request.url.path.startswith("/proxy"): return True diff --git a/src/dstack/_internal/server/compatibility/__init__.py b/src/dstack/_internal/server/compatibility/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/compatibility/common.py 
b/src/dstack/_internal/server/compatibility/common.py new file mode 100644 index 0000000000..227b45fdaf --- /dev/null +++ b/src/dstack/_internal/server/compatibility/common.py @@ -0,0 +1,20 @@ +from typing import Optional + +from packaging.version import Version + +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOfferWithAvailability, +) + + +def patch_offers_list( + offers: list[InstanceOfferWithAvailability], client_version: Optional[Version] +) -> None: + if client_version is None: + return + # CLIs prior to 0.20.4 incorrectly display the `no_balance` availability in the run/fleet plan + if client_version < Version("0.20.4"): + for offer in offers: + if offer.availability == InstanceAvailability.NO_BALANCE: + offer.availability = InstanceAvailability.NOT_AVAILABLE diff --git a/src/dstack/_internal/server/compatibility/gpus.py b/src/dstack/_internal/server/compatibility/gpus.py new file mode 100644 index 0000000000..8548e58bf9 --- /dev/null +++ b/src/dstack/_internal/server/compatibility/gpus.py @@ -0,0 +1,22 @@ +from typing import Optional + +from packaging.version import Version + +from dstack._internal.core.models.instances import InstanceAvailability +from dstack._internal.server.schemas.gpus import ListGpusResponse + + +def patch_list_gpus_response( + response: ListGpusResponse, client_version: Optional[Version] +) -> None: + if client_version is None: + return + # CLIs prior to 0.20.4 incorrectly display the `no_balance` availability in `dstack offer --group-by gpu` + if client_version < Version("0.20.4"): + for gpu in response.gpus: + if InstanceAvailability.NO_BALANCE in gpu.availability: + gpu.availability = [ + a for a in gpu.availability if a != InstanceAvailability.NO_BALANCE + ] + if InstanceAvailability.NOT_AVAILABLE not in gpu.availability: + gpu.availability.append(InstanceAvailability.NOT_AVAILABLE) diff --git a/src/dstack/_internal/server/routers/fleets.py b/src/dstack/_internal/server/routers/fleets.py 
index 7e7126f4bf..d423134675 100644 --- a/src/dstack/_internal/server/routers/fleets.py +++ b/src/dstack/_internal/server/routers/fleets.py @@ -1,11 +1,13 @@ -from typing import List, Tuple +from typing import List, Optional, Tuple from fastapi import APIRouter, Depends +from packaging.version import Version from sqlalchemy.ext.asyncio import AsyncSession import dstack._internal.server.services.fleets as fleets_services from dstack._internal.core.errors import ResourceNotExistsError from dstack._internal.core.models.fleets import Fleet, FleetPlan +from dstack._internal.server.compatibility.common import patch_offers_list from dstack._internal.server.db import get_session from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.fleets import ( @@ -21,6 +23,7 @@ from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, + get_client_version, ) root_router = APIRouter( @@ -101,6 +104,7 @@ async def get_plan( body: GetFleetPlanRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + client_version: Optional[Version] = Depends(get_client_version), ): """ Returns a fleet plan for the given fleet configuration. 
@@ -112,6 +116,7 @@ async def get_plan( user=user, spec=body.spec, ) + patch_offers_list(plan.offers, client_version) return CustomORJSONResponse(plan) diff --git a/src/dstack/_internal/server/routers/gpus.py b/src/dstack/_internal/server/routers/gpus.py index 45f0e8bf1f..3a701fb1e8 100644 --- a/src/dstack/_internal/server/routers/gpus.py +++ b/src/dstack/_internal/server/routers/gpus.py @@ -1,12 +1,17 @@ -from typing import Tuple +from typing import Annotated, Optional, Tuple from fastapi import APIRouter, Depends +from packaging.version import Version +from dstack._internal.server.compatibility.gpus import patch_list_gpus_response from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.gpus import ListGpusRequest, ListGpusResponse from dstack._internal.server.security.permissions import ProjectMember from dstack._internal.server.services.gpus import list_gpus_grouped -from dstack._internal.server.utils.routers import get_base_api_additional_responses +from dstack._internal.server.utils.routers import ( + get_base_api_additional_responses, + get_client_version, +) project_router = APIRouter( prefix="/api/project/{project_name}/gpus", @@ -18,7 +23,10 @@ @project_router.post("/list", response_model=ListGpusResponse, response_model_exclude_none=True) async def list_gpus( body: ListGpusRequest, + client_version: Annotated[Optional[Version], Depends(get_client_version)], user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), ) -> ListGpusResponse: _, project = user_project - return await list_gpus_grouped(project=project, run_spec=body.run_spec, group_by=body.group_by) + resp = await list_gpus_grouped(project=project, run_spec=body.run_spec, group_by=body.group_by) + patch_list_gpus_response(resp, client_version) + return resp diff --git a/src/dstack/_internal/server/routers/runs.py b/src/dstack/_internal/server/routers/runs.py index a4a09b3fb8..27d378d8ba 100644 --- 
a/src/dstack/_internal/server/routers/runs.py +++ b/src/dstack/_internal/server/routers/runs.py @@ -1,10 +1,12 @@ -from typing import Annotated, List, Optional, Tuple, cast +from typing import Annotated, List, Optional, Tuple -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends +from packaging.version import Version from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.errors import ResourceNotExistsError from dstack._internal.core.models.runs import Run, RunPlan +from dstack._internal.server.compatibility.common import patch_offers_list from dstack._internal.server.db import get_session from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.runs import ( @@ -21,6 +23,7 @@ from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, + get_client_version, ) root_router = APIRouter( @@ -35,9 +38,10 @@ ) -def use_legacy_repo_dir(request: Request) -> bool: - client_release = cast(Optional[tuple[int, ...]], request.state.client_release) - return client_release is not None and client_release < (0, 19, 27) +def use_legacy_repo_dir( + client_version: Annotated[Optional[Version], Depends(get_client_version)], +) -> bool: + return client_version is not None and client_version < Version("0.19.27") @root_router.post( @@ -110,6 +114,7 @@ async def get_plan( body: GetRunPlanRequest, session: Annotated[AsyncSession, Depends(get_session)], user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())], + client_version: Annotated[Optional[Version], Depends(get_client_version)], legacy_repo_dir: Annotated[bool, Depends(use_legacy_repo_dir)], ): """ @@ -127,6 +132,8 @@ async def get_plan( max_offers=body.max_offers, legacy_repo_dir=legacy_repo_dir, ) + for job_plan in run_plan.job_plans: + patch_offers_list(job_plan.offers, client_version) return CustomORJSONResponse(run_plan) diff --git 
a/src/dstack/_internal/server/utils/routers.py b/src/dstack/_internal/server/utils/routers.py index a625ccd9a2..5aff751868 100644 --- a/src/dstack/_internal/server/utils/routers.py +++ b/src/dstack/_internal/server/utils/routers.py @@ -124,19 +124,28 @@ def get_request_size(request: Request) -> int: def get_client_version(request: Request) -> Optional[packaging.version.Version]: + """ + FastAPI dependency that returns the dstack client version or None if the version is latest/dev. + """ + version = request.headers.get("x-api-version") if version is None: return None - return parse_version(version) + try: + return parse_version(version) + except ValueError as e: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=[error_detail(str(e))], + ) def check_client_server_compatibility( client_version: Optional[packaging.version.Version], server_version: Optional[str], -) -> Optional[CustomORJSONResponse]: +) -> None: """ - Returns `JSONResponse` with error if client/server versions are incompatible. - Returns `None` otherwise. + Raise HTTP exception if the client is incompatible with the server. """ if client_version is None or server_version is None: return None @@ -149,21 +158,9 @@ def check_client_server_compatibility( client_version.major > parsed_server_version.major or client_version.minor > parsed_server_version.minor ): - return error_incompatible_versions( - str(client_version), server_version, ask_cli_update=False + msg = f"The client/CLI version ({client_version}) is incompatible with the server version ({server_version})." + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=get_server_client_error_details(ServerClientError(msg=msg)), ) return None - - -def error_incompatible_versions( - client_version: Optional[str], - server_version: str, - ask_cli_update: bool, -) -> CustomORJSONResponse: - msg = f"The client/CLI version ({client_version}) is incompatible with the server version ({server_version})." 
- if ask_cli_update: - msg += f" Update the dstack CLI: `pip install dstack=={server_version}`." - return CustomORJSONResponse( - status_code=status.HTTP_400_BAD_REQUEST, - content={"detail": get_server_client_error_details(ServerClientError(msg=msg))}, - ) diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index 12e439111e..afa68b788d 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -1,5 +1,6 @@ import json from datetime import datetime, timezone +from typing import Optional from unittest.mock import Mock, patch from uuid import UUID, uuid4 @@ -1167,6 +1168,68 @@ async def test_returns_create_plan_for_existing_fleet( "action": "create", } + @pytest.mark.parametrize( + ("client_version", "expected_availability"), + [ + ("0.20.3", InstanceAvailability.NOT_AVAILABLE), + ("0.20.4", InstanceAvailability.NO_BALANCE), + (None, InstanceAvailability.NO_BALANCE), + ], + ) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_replaces_no_balance_with_not_available_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_availability: InstanceAvailability, + ): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + offers = [ + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-1", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ), + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-2", + resources=Resources(cpus=2, memory_mib=1024, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.NO_BALANCE, + ), + ] + headers = 
get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = offers + response = await client.post( + f"/api/project/{project.name}/fleets/get_plan", + headers=headers, + json={"spec": get_fleet_spec().dict()}, + ) + + assert response.status_code == 200 + offers = response.json()["offers"] + assert len(offers) == 2 + assert offers[0]["availability"] == InstanceAvailability.AVAILABLE.value + assert offers[1]["availability"] == expected_availability.value + def _fleet_model_to_json_dict(fleet: FleetModel) -> dict: return json.loads(fleet_model_to_fleet(fleet).json()) diff --git a/src/tests/_internal/server/routers/test_gpus.py b/src/tests/_internal/server/routers/test_gpus.py index d07a92bb2f..32c862231a 100644 --- a/src/tests/_internal/server/routers/test_gpus.py +++ b/src/tests/_internal/server/routers/test_gpus.py @@ -96,15 +96,19 @@ async def call_gpus_api( user_token: str, run_spec: RunSpec, group_by: Optional[List[str]] = None, + client_version: Optional[str] = None, ): """Helper to call the GPUs API with standard parameters.""" json_data = {"run_spec": run_spec.dict()} if group_by is not None: json_data["group_by"] = group_by + headers = get_auth_headers(user_token) + if client_version is not None: + headers["X-API-Version"] = client_version return await client.post( f"/api/project/{project_name}/gpus/list", - headers=get_auth_headers(user_token), + headers=headers, json=json_data, ) @@ -511,3 +515,44 @@ async def test_exact_aggregation_values( assert rtx_runpod_euwest1["region"] == "eu-west-1" assert rtx_runpod_euwest1["price"]["min"] == 0.65 assert rtx_runpod_euwest1["price"]["max"] == 0.65 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], 
indirect=True) + @pytest.mark.parametrize( + ("client_version", "expected_availability"), + [ + ("0.20.3", InstanceAvailability.NOT_AVAILABLE), + ("0.20.4", InstanceAvailability.NO_BALANCE), + (None, InstanceAvailability.NO_BALANCE), + ], + ) + async def test_replaces_no_balance_with_not_available_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_availability: InstanceAvailability, + ): + user, project, repo, run_spec = await gpu_test_setup(session) + + available_offer = create_gpu_offer( + BackendType.AWS, "T4", 16384, 0.50, availability=InstanceAvailability.AVAILABLE + ) + no_balance_offer = create_gpu_offer( + BackendType.AWS, "L4", 24 * 1024, 1.0, availability=InstanceAvailability.NO_BALANCE + ) + offers_by_backend = {BackendType.AWS: [available_offer, no_balance_offer]} + mocked_backends = create_mock_backends_with_offers(offers_by_backend) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = mocked_backends + response = await call_gpus_api( + client, project.name, user.token, run_spec, client_version=client_version + ) + + assert response.status_code == 200 + response_data = response.json() + assert len(response_data["gpus"]) == 2 + assert response_data["gpus"][0]["availability"] == [InstanceAvailability.AVAILABLE.value] + assert response_data["gpus"][1]["availability"] == [expected_availability.value] diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 4f3ab2ed2d..627fa8a167 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -1280,6 +1280,79 @@ async def test_returns_run_plan_instance_volumes( assert response.status_code == 200, response.json() assert response.json() == run_plan_dict + @pytest.mark.parametrize( + ("client_version", "expected_availability"), + [ + ("0.20.3", 
InstanceAvailability.NOT_AVAILABLE), + ("0.20.4", InstanceAvailability.NO_BALANCE), + (None, InstanceAvailability.NO_BALANCE), + ], + ) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_replaces_no_balance_with_not_available_for_old_clients( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + client_version: Optional[str], + expected_availability: InstanceAvailability, + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + fleet_spec = get_fleet_spec() + fleet_spec.configuration.nodes = FleetNodesSpec(min=0, target=0, max=None) + await create_fleet(session=session, project=project, spec=fleet_spec) + repo = await create_repo(session=session, project_id=project.id) + offers = [ + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-1", + resources=Resources(cpus=1, memory_mib=512, spot=False, gpus=[]), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + ), + InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance-2", + resources=Resources(cpus=2, memory_mib=1024, spot=False, gpus=[]), + ), + region="us", + price=2.0, + availability=InstanceAvailability.NO_BALANCE, + ), + ] + run_plan_dict = get_dev_env_run_plan_dict( + project_name=project.name, + username=user.name, + repo_id=repo.name, + offers=offers, + total_offers=1, + max_price=1.0, + ) + body = {"run_spec": run_plan_dict["run_spec"]} + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value.get_offers.return_value = offers + m.return_value = [backend_mock] + response = await client.post( + 
f"/api/project/{project.name}/runs/get_plan", + headers=headers, + json=body, + ) + offers = response.json()["job_plans"][0]["offers"] + assert len(offers) == 2 + assert offers[0]["availability"] == InstanceAvailability.AVAILABLE.value + assert offers[1]["availability"] == expected_availability.value + @pytest.mark.asyncio @pytest.mark.parametrize( ("old_conf", "new_conf", "action"), diff --git a/src/tests/_internal/server/test_app.py b/src/tests/_internal/server/test_app.py index 8f11660d35..4fafb04e31 100644 --- a/src/tests/_internal/server/test_app.py +++ b/src/tests/_internal/server/test_app.py @@ -1,9 +1,14 @@ +from typing import Optional +from unittest.mock import patch + import pytest from fastapi.testclient import TestClient from httpx import AsyncClient from sqlalchemy.ext.asyncio import AsyncSession +from dstack._internal import settings from dstack._internal.server.main import app +from dstack._internal.server.testing.common import create_user, get_auth_headers client = TestClient(app) @@ -16,3 +21,78 @@ async def test_returns_html(self, test_db, session: AsyncSession, client: AsyncC response = await client.get("/") assert response.status_code == 200 assert response.content.startswith(b'<') + + +class TestCheckXApiVersion: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + ("client_version", "server_version", "is_compatible"), + [ + ("12.12.12", None, True), + ("0.12.4", "0.12.4", True), + (None, "0.1.12", True), + ("0.13.0", "0.12.4", False), + # For test performance, only a few cases are covered here. + # More cases are covered in `TestCheckClientServerCompatibility`. 
+ ], + ) + @pytest.mark.parametrize("endpoint", ["/api/users/list", "/api/projects/list"]) + async def test_check_client_compatibility( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + endpoint: str, + client_version: Optional[str], + server_version: Optional[str], + is_compatible: bool, + ): + user = await create_user(session=session) + headers = get_auth_headers(user.token) + if client_version is not None: + headers["X-API-Version"] = client_version + + with patch.object(settings, "DSTACK_VERSION", server_version): + response = await client.post(endpoint, headers=headers, json={}) + + if is_compatible: + assert response.status_code == 200, response.text + else: + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "code": "error", + "msg": f"The client/CLI version ({client_version}) is incompatible with the server version ({server_version}).", + } + ] + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("endpoint", ["/api/users/list", "/api/projects/list"]) + @pytest.mark.parametrize("invalid_value", ["", "1..0", "version1"]) + async def test_invalid_x_api_version_header( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + endpoint: str, + invalid_value: str, + ): + user = await create_user(session=session) + headers = get_auth_headers(user.token) + headers["X-API-Version"] = invalid_value + + response = await client.post(endpoint, headers=headers, json={}) + + assert response.status_code == 400 + assert response.json() == { + "detail": [ + { + "code": None, + "msg": f"Invalid version: {invalid_value}", + } + ] + } diff --git a/src/tests/_internal/server/utils/test_routers.py b/src/tests/_internal/server/utils/test_routers.py index d3ea11213c..0aeb4be8b8 100644 --- a/src/tests/_internal/server/utils/test_routers.py +++ b/src/tests/_internal/server/utils/test_routers.py @@ -2,69 +2,51 @@ import packaging.version 
import pytest +from fastapi import HTTPException from dstack._internal.server.utils.routers import check_client_server_compatibility class TestCheckClientServerCompatibility: - @pytest.mark.parametrize("client_version", [packaging.version.parse("12.12.12"), None]) - def test_returns_none_if_server_version_is_none( - self, client_version: Optional[packaging.version.Version] - ): - assert ( - check_client_server_compatibility( - client_version=client_version, - server_version=None, - ) - is None - ) - @pytest.mark.parametrize( - "client_version,server_version", + ("client_version", "server_version"), [ + ("0.12.5", "0.12.4"), + ("0.12.5rc1", "0.12.4"), + ("0.12.4rc1", "0.12.4"), ("0.12.4", "0.12.4"), ("0.12.4", "0.12.5"), ("0.12.4", "0.13.0"), ("0.12.4", "1.12.0"), ("0.12.4", "0.12.5rc1"), ("1.0.5", "1.0.6"), + ("12.12.12", None), + (None, "0.1.12"), + (None, None), ], ) - def test_returns_none_if_compatible(self, client_version: str, server_version: str): - assert ( - check_client_server_compatibility( - client_version=packaging.version.parse(client_version), - server_version=server_version, - ) - is None - ) + def test_compatible( + self, client_version: Optional[str], server_version: Optional[str] + ) -> None: + parsed_client_version = None + if client_version is not None: + parsed_client_version = packaging.version.parse(client_version) - @pytest.mark.parametrize( - "client_version,server_version", - [ - ("0.13.0", "0.12.4"), - ("1.12.0", "0.12.0"), - ], - ) - def test_returns_error_if_client_version_larger( - self, client_version: str, server_version: str - ): - res = check_client_server_compatibility( - client_version=packaging.version.parse(client_version), + check_client_server_compatibility( + client_version=parsed_client_version, server_version=server_version, ) - assert res is not None @pytest.mark.parametrize( - "server_version", + ("client_version", "server_version"), [ - None, - "0.1.12", + ("0.13.0", "0.12.4"), + ("1.12.0", "0.12.0"), ], ) - def 
test_returns_none_if_client_version_is_latest(self, server_version: Optional[str]): - res = check_client_server_compatibility( - client_version=None, - server_version=server_version, - ) - assert res is None + def test_incompatible(self, client_version: str, server_version: str) -> None: + with pytest.raises(HTTPException): + check_client_server_compatibility( + client_version=packaging.version.parse(client_version), + server_version=server_version, + ) From ad6423dff6c571871f9590a7b249d18f7ff9d3ed Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Thu, 15 Jan 2026 14:16:55 +0500 Subject: [PATCH 050/187] Optimize job submissions loading (#3466) * Optimize process_running_jobs select * Optimize process_runs select * Add test_calculates_retry_duration_since_last_successful_submission * Fix _should_retry_job --- .../background/tasks/process_running_jobs.py | 72 ++++++++---- .../server/background/tasks/process_runs.py | 108 +++++++++++++----- .../background/tasks/test_process_runs.py | 45 +++++++- 3 files changed, 171 insertions(+), 54 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index 341b47a38b..f5ca6c61ae 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -5,9 +5,9 @@ from datetime import timedelta from typing import Dict, List, Optional -from sqlalchemy import select +from sqlalchemy import and_, func, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload, load_only +from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only from dstack._internal import settings from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT @@ -139,25 +139,8 @@ async def _process_next_running_job(): async def _process_running_job(session: AsyncSession, job_model: JobModel): 
- # Refetch to load related attributes. - res = await session.execute( - select(JobModel) - .where(JobModel.id == job_model.id) - .options(joinedload(JobModel.instance).joinedload(InstanceModel.project)) - .options(joinedload(JobModel.probes).load_only(ProbeModel.success_streak)) - .execution_options(populate_existing=True) - ) - job_model = res.unique().scalar_one() - res = await session.execute( - select(RunModel) - .where(RunModel.id == job_model.run_id) - .options(joinedload(RunModel.project)) - .options(joinedload(RunModel.user)) - .options(joinedload(RunModel.repo)) - .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) - .options(joinedload(RunModel.jobs)) - ) - run_model = res.unique().scalar_one() + job_model = await _refetch_job_model(session, job_model) + run_model = await _fetch_run_model(session, job_model.run_id) repo_model = run_model.repo project = run_model.project run = run_model_to_run(run_model, include_sensitive=True) @@ -421,6 +404,53 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel): await session.commit() +async def _refetch_job_model(session: AsyncSession, job_model: JobModel) -> JobModel: + res = await session.execute( + select(JobModel) + .where(JobModel.id == job_model.id) + .options(joinedload(JobModel.instance).joinedload(InstanceModel.project)) + .options(joinedload(JobModel.probes).load_only(ProbeModel.success_streak)) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one() + + +async def _fetch_run_model(session: AsyncSession, run_id: uuid.UUID) -> RunModel: + # Select only latest submissions for every job. 
+ latest_submissions_sq = ( + select( + JobModel.run_id.label("run_id"), + JobModel.replica_num.label("replica_num"), + JobModel.job_num.label("job_num"), + func.max(JobModel.submission_num).label("max_submission_num"), + ) + .where(JobModel.run_id == run_id) + .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num) + .subquery() + ) + job_alias = aliased(JobModel) + res = await session.execute( + select(RunModel) + .where(RunModel.id == run_id) + .join(job_alias, job_alias.run_id == RunModel.id) + .join( + latest_submissions_sq, + onclause=and_( + job_alias.run_id == latest_submissions_sq.c.run_id, + job_alias.replica_num == latest_submissions_sq.c.replica_num, + job_alias.job_num == latest_submissions_sq.c.job_num, + job_alias.submission_num == latest_submissions_sq.c.max_submission_num, + ), + ) + .options(joinedload(RunModel.project)) + .options(joinedload(RunModel.user)) + .options(joinedload(RunModel.repo)) + .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) + .options(contains_eager(RunModel.jobs, alias=job_alias)) + ) + return res.unique().scalar_one() + + async def _wait_for_instance_provisioning_data(session: AsyncSession, job_model: JobModel): """ This function will be called until instance IP address appears diff --git a/src/dstack/_internal/server/background/tasks/process_runs.py b/src/dstack/_internal/server/background/tasks/process_runs.py index af2dcee8d8..b4397b95e0 100644 --- a/src/dstack/_internal/server/background/tasks/process_runs.py +++ b/src/dstack/_internal/server/background/tasks/process_runs.py @@ -2,9 +2,9 @@ import datetime from typing import List, Optional, Set, Tuple -from sqlalchemy import and_, or_, select +from sqlalchemy import and_, func, or_, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload, load_only, selectinload +from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only import 
dstack._internal.server.services.services.autoscalers as autoscalers from dstack._internal.core.errors import ServerError @@ -33,6 +33,7 @@ get_job_specs_from_run_spec, group_jobs_by_replica_latest, is_master_job, + job_model_to_job_submission, switch_job_status, ) from dstack._internal.server.services.locking import get_locker @@ -144,22 +145,7 @@ async def _process_next_run(): async def _process_run(session: AsyncSession, run_model: RunModel): - # Refetch to load related attributes. - res = await session.execute( - select(RunModel) - .where(RunModel.id == run_model.id) - .execution_options(populate_existing=True) - .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name)) - .options(joinedload(RunModel.user).load_only(UserModel.name)) - .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) - .options( - selectinload(RunModel.jobs) - .joinedload(JobModel.instance) - .load_only(InstanceModel.fleet_id) - ) - .execution_options(populate_existing=True) - ) - run_model = res.unique().scalar_one() + run_model = await _refetch_run_model(session, run_model) logger.debug("%s: processing run", fmt(run_model)) try: if run_model.status == RunStatus.PENDING: @@ -181,6 +167,46 @@ async def _process_run(session: AsyncSession, run_model: RunModel): await session.commit() +async def _refetch_run_model(session: AsyncSession, run_model: RunModel) -> RunModel: + # Select only latest submissions for every job. 
+ latest_submissions_sq = ( + select( + JobModel.run_id.label("run_id"), + JobModel.replica_num.label("replica_num"), + JobModel.job_num.label("job_num"), + func.max(JobModel.submission_num).label("max_submission_num"), + ) + .where(JobModel.run_id == run_model.id) + .group_by(JobModel.run_id, JobModel.replica_num, JobModel.job_num) + .subquery() + ) + job_alias = aliased(JobModel) + res = await session.execute( + select(RunModel) + .where(RunModel.id == run_model.id) + .outerjoin(latest_submissions_sq, latest_submissions_sq.c.run_id == RunModel.id) + .outerjoin( + job_alias, + onclause=and_( + job_alias.run_id == latest_submissions_sq.c.run_id, + job_alias.replica_num == latest_submissions_sq.c.replica_num, + job_alias.job_num == latest_submissions_sq.c.job_num, + job_alias.submission_num == latest_submissions_sq.c.max_submission_num, + ), + ) + .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name)) + .options(joinedload(RunModel.user).load_only(UserModel.name)) + .options(joinedload(RunModel.fleet).load_only(FleetModel.id, FleetModel.name)) + .options( + contains_eager(RunModel.jobs, alias=job_alias) + .joinedload(JobModel.instance) + .load_only(InstanceModel.fleet_id) + ) + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one() + + async def _process_pending_run(session: AsyncSession, run_model: RunModel): """Jobs are not created yet""" run = run_model_to_run(run_model) @@ -294,7 +320,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel): and job_model.termination_reason not in {JobTerminationReason.DONE_BY_RUNNER, JobTerminationReason.SCALED_DOWN} ): - current_duration = _should_retry_job(run, job, job_model) + current_duration = await _should_retry_job(session, run, job, job_model) if current_duration is None: replica_statuses.add(RunStatus.FAILED) run_termination_reasons.add(RunTerminationReason.JOB_FAILED) @@ -552,19 +578,44 @@ def _has_out_of_date_replicas(run: RunModel) -> 
bool: return False -def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datetime.timedelta]: +async def _should_retry_job( + session: AsyncSession, + run: Run, + job: Job, + job_model: JobModel, +) -> Optional[datetime.timedelta]: """ Checks if the job should be retried. Returns the current duration of retrying if retry is enabled. + Retrying duration is calculated as the time since `last_processed_at` + of the latest provisioned submission. """ if job.job_spec.retry is None: return None last_provisioned_submission = None - for job_submission in reversed(job.job_submissions): - if job_submission.job_provisioning_data is not None: - last_provisioned_submission = job_submission - break + if len(job.job_submissions) > 0: + last_submission = job.job_submissions[-1] + if last_submission.job_provisioning_data is not None: + last_provisioned_submission = last_submission + else: + # The caller passes at most one latest submission in job.job_submissions, so check the db. + res = await session.execute( + select(JobModel) + .where( + JobModel.run_id == job_model.run_id, + JobModel.replica_num == job_model.replica_num, + JobModel.job_num == job_model.job_num, + JobModel.job_provisioning_data.is_not(None), + ) + .order_by(JobModel.last_processed_at.desc()) + .limit(1) + ) + last_provisioned_submission_model = res.scalar() + if last_provisioned_submission_model is not None: + last_provisioned_submission = job_model_to_job_submission( + last_provisioned_submission_model + ) if ( job_model.termination_reason is not None @@ -574,13 +625,10 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datet ): return common.get_current_datetime() - run.submitted_at - if last_provisioned_submission is None: - return None - if ( - last_provisioned_submission.termination_reason is not None - and JobTerminationReason(last_provisioned_submission.termination_reason).to_retry_event() - in job.job_spec.retry.on_events + job_model.termination_reason is 
not None + and job_model.termination_reason.to_retry_event() in job.job_spec.retry.on_events + and last_provisioned_submission is not None ): return common.get_current_datetime() - last_provisioned_submission.last_processed_at diff --git a/src/tests/_internal/server/background/tasks/test_process_runs.py b/src/tests/_internal/server/background/tasks/test_process_runs.py index 81c1ef0026..46aaa9b48e 100644 --- a/src/tests/_internal/server/background/tasks/test_process_runs.py +++ b/src/tests/_internal/server/background/tasks/test_process_runs.py @@ -1,6 +1,6 @@ import datetime from collections.abc import Iterable -from typing import Union, cast +from typing import Optional, Union, cast from unittest.mock import patch import pytest @@ -15,7 +15,7 @@ TaskConfiguration, ) from dstack._internal.core.models.instances import InstanceStatus -from dstack._internal.core.models.profiles import Profile, ProfileRetry, Schedule +from dstack._internal.core.models.profiles import Profile, ProfileRetry, RetryEvent, Schedule from dstack._internal.core.models.resources import Range from dstack._internal.core.models.runs import ( JobSpec, @@ -48,6 +48,7 @@ async def make_run( deployment_num: int = 0, image: str = "ubuntu:latest", probes: Iterable[ProbeConfig] = (), + retry: Optional[ProfileRetry] = None, ) -> RunModel: project = await create_project(session=session) user = await create_user(session=session) @@ -58,7 +59,7 @@ async def make_run( run_name = "test-run" profile = Profile( name="test-profile", - retry=True, + retry=retry or True, ) run_spec = get_run_spec( repo_id=repo.name, @@ -230,6 +231,44 @@ async def test_retry_running_to_failed(self, test_db, session: AsyncSession): assert run.status == RunStatus.TERMINATING assert run.termination_reason == RunTerminationReason.JOB_FAILED + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_calculates_retry_duration_since_last_successful_submission( + self, test_db, 
session: AsyncSession + ): + run = await make_run( + session, + status=RunStatus.RUNNING, + replicas=1, + retry=ProfileRetry(duration=300, on_events=[RetryEvent.NO_CAPACITY]), + ) + now = run.submitted_at + datetime.timedelta(minutes=10) + # Retry logic should look at this job and calculate retry duration since its last_processed_at. + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.EXECUTOR_ERROR, + last_processed_at=now - datetime.timedelta(minutes=4), + replica_num=0, + job_provisioning_data=get_job_provisioning_data(), + ) + await create_job( + session=session, + run=run, + status=JobStatus.FAILED, + termination_reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY, + replica_num=0, + submission_num=1, + last_processed_at=now - datetime.timedelta(minutes=2), + job_provisioning_data=None, + ) + with patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock: + datetime_mock.return_value = now + await process_runs.process_runs() + await session.refresh(run) + assert run.status == RunStatus.PENDING + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_pending_to_submitted(self, test_db, session: AsyncSession): From 8b383ba662b5565ce21ecf954916e0877af89cda Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 15 Jan 2026 10:15:29 +0000 Subject: [PATCH 051/187] [CLI] Add `--memory` option to `apply` and `offer` (#3461) --- src/dstack/_internal/cli/commands/offer.py | 26 +-------- .../cli/services/configurators/run.py | 36 ++----------- .../_internal/cli/services/resources.py | 54 +++++++++++++++++++ 3 files changed, 60 insertions(+), 56 deletions(-) create mode 100644 src/dstack/_internal/cli/services/resources.py diff --git a/src/dstack/_internal/cli/commands/offer.py b/src/dstack/_internal/cli/commands/offer.py index bc6bb0a5db..0e4be1d5c2 100644 --- a/src/dstack/_internal/cli/commands/offer.py +++ 
b/src/dstack/_internal/cli/commands/offer.py @@ -3,11 +3,11 @@ from typing import List, Literal, cast from dstack._internal.cli.commands import APIBaseCommand -from dstack._internal.cli.services.args import cpu_spec, disk_spec, gpu_spec from dstack._internal.cli.services.configurators.run import ( BaseRunConfigurator, ) from dstack._internal.cli.services.profile import register_profile_args +from dstack._internal.cli.services.resources import register_resources_args from dstack._internal.cli.utils.common import console from dstack._internal.cli.utils.gpu import print_gpu_json, print_gpu_table from dstack._internal.cli.utils.run import print_offers_json, print_run_plan @@ -47,29 +47,7 @@ def register_args(cls, parser: argparse.ArgumentParser): default=50, ) cls.register_env_args(configuration_group) - configuration_group.add_argument( - "--cpu", - type=cpu_spec, - help="Request CPU for the run. " - "The format is [code]ARCH[/]:[code]COUNT[/] (all parts are optional)", - dest="cpu_spec", - metavar="SPEC", - ) - configuration_group.add_argument( - "--gpu", - type=gpu_spec, - help="Request GPU for the run. " - "The format is [code]NAME[/]:[code]COUNT[/]:[code]MEMORY[/] (all parts are optional)", - dest="gpu_spec", - metavar="SPEC", - ) - configuration_group.add_argument( - "--disk", - type=disk_spec, - help="Request the size range of disk for the run. 
Example [code]--disk 100GB..[/].", - metavar="RANGE", - dest="disk_spec", - ) + register_resources_args(configuration_group) register_profile_args(parser) diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index 3d126dd34f..fc76fe43ed 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -12,8 +12,7 @@ import gpuhunt from pydantic import parse_obj_as -import dstack._internal.core.models.resources as resources -from dstack._internal.cli.services.args import cpu_spec, disk_spec, gpu_spec, port_mapping +from dstack._internal.cli.services.args import port_mapping from dstack._internal.cli.services.configurators.base import ( ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator, @@ -26,6 +25,7 @@ is_git_repo_url, register_init_repo_args, ) +from dstack._internal.cli.services.resources import apply_resources_args, register_resources_args from dstack._internal.cli.utils.common import confirm_ask, console from dstack._internal.cli.utils.rich import MultiItemStatus from dstack._internal.cli.utils.run import get_runs_table, print_run_plan @@ -309,29 +309,7 @@ def register_args(cls, parser: argparse.ArgumentParser): default=3, ) cls.register_env_args(configuration_group) - configuration_group.add_argument( - "--cpu", - type=cpu_spec, - help="Request CPU for the run. " - "The format is [code]ARCH[/]:[code]COUNT[/] (all parts are optional)", - dest="cpu_spec", - metavar="SPEC", - ) - configuration_group.add_argument( - "--gpu", - type=gpu_spec, - help="Request GPU for the run. " - "The format is [code]NAME[/]:[code]COUNT[/]:[code]MEMORY[/] (all parts are optional)", - dest="gpu_spec", - metavar="SPEC", - ) - configuration_group.add_argument( - "--disk", - type=disk_spec, - help="Request the size range of disk for the run. 
Example [code]--disk 100GB..[/].", - metavar="RANGE", - dest="disk_spec", - ) + register_resources_args(configuration_group) register_profile_args(parser) repo_group = parser.add_argument_group("Repo Options") repo_group.add_argument( @@ -359,16 +337,10 @@ def register_args(cls, parser: argparse.ArgumentParser): register_init_repo_args(repo_group) def apply_args(self, conf: RunConfigurationT, args: argparse.Namespace): + apply_resources_args(args, conf) apply_profile_args(args, conf) if args.run_name: conf.name = args.run_name - if args.cpu_spec: - conf.resources.cpu = resources.CPUSpec.parse_obj(args.cpu_spec) - if args.gpu_spec: - conf.resources.gpu = resources.GPUSpec.parse_obj(args.gpu_spec) - if args.disk_spec: - conf.resources.disk = args.disk_spec - self.apply_env_vars(conf.env, args) self.interpolate_env(conf) diff --git a/src/dstack/_internal/cli/services/resources.py b/src/dstack/_internal/cli/services/resources.py new file mode 100644 index 0000000000..e81b6078db --- /dev/null +++ b/src/dstack/_internal/cli/services/resources.py @@ -0,0 +1,54 @@ +import argparse + +from dstack._internal.cli.services.args import cpu_spec, disk_spec, gpu_spec, memory_spec +from dstack._internal.cli.services.configurators.base import ArgsParser +from dstack._internal.core.models import resources +from dstack._internal.core.models.configurations import AnyRunConfiguration + + +def register_resources_args(parser: ArgsParser) -> None: + parser.add_argument( + "--cpu", + type=cpu_spec, + help=( + "Request CPU for the run." + " The format is [code]ARCH[/]:[code]COUNT[/] (all parts are optional)" + ), + dest="cpu_spec", + metavar="SPEC", + ) + parser.add_argument( + "--gpu", + type=gpu_spec, + help=( + "Request GPU for the run." + " The format is [code]NAME[/]:[code]COUNT[/]:[code]MEMORY[/] (all parts are optional)" + ), + dest="gpu_spec", + metavar="SPEC", + ) + parser.add_argument( + "--memory", + type=memory_spec, + help="Request the size range of RAM for the run. 
Example [code]--memory 128GB..256GB[/]", + dest="memory_spec", + metavar="RANGE", + ) + parser.add_argument( + "--disk", + type=disk_spec, + help="Request the size range of disk for the run. Example [code]--disk 100GB..[/]", + dest="disk_spec", + metavar="RANGE", + ) + + +def apply_resources_args(args: argparse.Namespace, conf: AnyRunConfiguration) -> None: + if args.cpu_spec: + conf.resources.cpu = resources.CPUSpec.parse_obj(args.cpu_spec) + if args.gpu_spec: + conf.resources.gpu = resources.GPUSpec.parse_obj(args.gpu_spec) + if args.memory_spec: + conf.resources.memory = args.memory_spec + if args.disk_spec: + conf.resources.disk = args.disk_spec From a26c67b3b80c578043742e24270581102ded2460 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 15 Jan 2026 13:20:00 +0000 Subject: [PATCH 052/187] [runner] Rework and fix user processing (#3456) * Drop --home-dir option, use process user's home dir instead * Fix ownership of Git credentials, consider Git credentials errors non-fatal Closes: https://github.com/dstackai/dstack/issues/3419 --- runner/cmd/runner/main.go | 47 +- runner/consts/consts.go | 4 - runner/internal/common/utils.go | 4 +- runner/internal/common/utils_test.go | 8 +- runner/internal/executor/executor.go | 426 ++++++------------ runner/internal/executor/executor_test.go | 20 +- runner/internal/executor/files.go | 25 +- runner/internal/executor/repo.go | 20 +- runner/internal/executor/user.go | 184 ++++++++ runner/internal/executor/user_test.go | 232 ++++++++++ runner/internal/linux/user/user.go | 96 ++++ runner/internal/schemas/schemas.go | 16 - runner/internal/shim/docker.go | 3 - .../_internal/core/backends/base/compute.py | 4 - .../core/backends/kubernetes/compute.py | 1 - .../_internal/server/services/proxy/repo.py | 2 +- src/dstack/_internal/server/services/ssh.py | 2 +- 17 files changed, 715 insertions(+), 379 deletions(-) create mode 100644 runner/internal/executor/user.go create mode 100644 runner/internal/executor/user_test.go create mode 
100644 runner/internal/linux/user/user.go diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index 27e529417a..c2ed94f0eb 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -16,6 +16,7 @@ import ( "github.com/dstackai/dstack/runner/consts" "github.com/dstackai/dstack/runner/internal/executor" + linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" "github.com/dstackai/dstack/runner/internal/log" "github.com/dstackai/dstack/runner/internal/runner/api" "github.com/dstackai/dstack/runner/internal/ssh" @@ -30,7 +31,6 @@ func main() { func mainInner() int { var tempDir string - var homeDir string var httpPort int var sshPort int var sshAuthorizedKeys []string @@ -61,13 +61,6 @@ func mainInner() int { Destination: &tempDir, TakesFile: true, }, - &cli.StringFlag{ - Name: "home-dir", - Usage: "HomeDir directory for credentials and $HOME", - Value: consts.RunnerHomeDir, - Destination: &homeDir, - TakesFile: true, - }, &cli.IntFlag{ Name: "http-port", Usage: "Set a http port", @@ -87,7 +80,7 @@ func mainInner() int { }, }, Action: func(ctx context.Context, cmd *cli.Command) error { - return start(ctx, tempDir, homeDir, httpPort, sshPort, sshAuthorizedKeys, logLevel, Version) + return start(ctx, tempDir, httpPort, sshPort, sshAuthorizedKeys, logLevel, Version) }, }, }, @@ -104,7 +97,7 @@ func mainInner() int { return 0 } -func start(ctx context.Context, tempDir string, homeDir string, httpPort int, sshPort int, sshAuthorizedKeys []string, logLevel int, version string) error { +func start(ctx context.Context, tempDir string, httpPort int, sshPort int, sshAuthorizedKeys []string, logLevel int, version string) error { if err := os.MkdirAll(tempDir, 0o755); err != nil { return fmt.Errorf("create temp directory: %w", err) } @@ -114,15 +107,39 @@ func start(ctx context.Context, tempDir string, homeDir string, httpPort int, ss return fmt.Errorf("create default log file: %w", err) } defer func() { - closeErr := defaultLogFile.Close() 
- if closeErr != nil { - log.Error(ctx, "Failed to close default log file", "err", closeErr) + if err := defaultLogFile.Close(); err != nil { + log.Error(ctx, "Failed to close default log file", "err", err) } }() - log.DefaultEntry.Logger.SetOutput(io.MultiWriter(os.Stdout, defaultLogFile)) log.DefaultEntry.Logger.SetLevel(logrus.Level(logLevel)) + currentUser, err := linuxuser.FromCurrentProcess() + if err != nil { + return fmt.Errorf("get current process user: %w", err) + } + if !currentUser.IsRoot() { + return fmt.Errorf("must be root: %s", currentUser) + } + if currentUser.HomeDir == "" { + log.Warning(ctx, "Current user does not have home dir, using /root as a fallback", "user", currentUser) + currentUser.HomeDir = "/root" + } + // Fix the current process HOME, just in case some internals require it (e.g., they use os.UserHomeDir() or + // spawn a child process which uses that variable) + envHome, envHomeIsSet := os.LookupEnv("HOME") + if envHome != currentUser.HomeDir { + if !envHomeIsSet { + log.Warning(ctx, "HOME is not set, setting the value", "home", currentUser.HomeDir) + } else { + log.Warning(ctx, "HOME is incorrect, fixing the value", "current", envHome, "home", currentUser.HomeDir) + } + if err := os.Setenv("HOME", currentUser.HomeDir); err != nil { + return fmt.Errorf("set HOME: %w", err) + } + } + log.Trace(ctx, "Running as", "user", currentUser) + // NB: The Mkdir/Chown/Chmod code below relies on the fact that RunnerDstackDir path is _not_ nested (/dstack). 
// Adjust it if the path is changed to, e.g., /opt/dstack const dstackDir = consts.RunnerDstackDir @@ -163,7 +180,7 @@ func start(ctx context.Context, tempDir string, homeDir string, httpPort int, ss } }() - ex, err := executor.NewRunExecutor(tempDir, homeDir, dstackDir, sshd) + ex, err := executor.NewRunExecutor(tempDir, dstackDir, *currentUser, sshd) if err != nil { return fmt.Errorf("create executor: %w", err) } diff --git a/runner/consts/consts.go b/runner/consts/consts.go index 4da4a139f7..99f405c29d 100644 --- a/runner/consts/consts.go +++ b/runner/consts/consts.go @@ -26,10 +26,6 @@ const ( // NOTE: RunnerRuntimeDir would be a more appropriate name, but it's called tempDir // throughout runner's codebase RunnerTempDir = "/tmp/runner" - // Currently, it's a directory where authorized_keys, git credentials, etc. are placed - // The current user's homedir (as of 2024-12-28, it's always root) should be used - // instead of the hardcoded value - RunnerHomeDir = "/root" // A directory for: // 1. Files used by the runner and related components (e.g., sshd stores its config and log inside /dstack/ssh) // 2. 
Files shared between users (e.g., sshd authorized_keys, MPI hostfile) diff --git a/runner/internal/common/utils.go b/runner/internal/common/utils.go index 2582799704..5be68edf70 100644 --- a/runner/internal/common/utils.go +++ b/runner/internal/common/utils.go @@ -49,7 +49,7 @@ func ExpandPath(pth string, base string, home string) (string, error) { return pth, nil } -func MkdirAll(ctx context.Context, pth string, uid int, gid int) error { +func MkdirAll(ctx context.Context, pth string, uid int, gid int, perm os.FileMode) error { paths := []string{pth} for { pth = path.Dir(pth) @@ -60,7 +60,7 @@ func MkdirAll(ctx context.Context, pth string, uid int, gid int) error { } for _, p := range slices.Backward(paths) { if _, err := os.Stat(p); errors.Is(err, os.ErrNotExist) { - if err := os.Mkdir(p, 0o755); err != nil { + if err := os.Mkdir(p, perm); err != nil { return err } if uid != -1 || gid != -1 { diff --git a/runner/internal/common/utils_test.go b/runner/internal/common/utils_test.go index a49d080a2e..5fe780d503 100644 --- a/runner/internal/common/utils_test.go +++ b/runner/internal/common/utils_test.go @@ -120,7 +120,7 @@ func TestExpandtPath_ErrorTildeUsernameNotSupported_TildeUsernameWithPath(t *tes func TestMkdirAll_AbsPath_NotExists(t *testing.T) { absPath := path.Join(t.TempDir(), "a/b/c") require.NoDirExists(t, absPath) - err := MkdirAll(context.Background(), absPath, -1, -1) + err := MkdirAll(context.Background(), absPath, -1, -1, 0o755) require.NoError(t, err) require.DirExists(t, absPath) } @@ -128,7 +128,7 @@ func TestMkdirAll_AbsPath_NotExists(t *testing.T) { func TestMkdirAll_AbsPath_Exists(t *testing.T) { absPath, err := os.Getwd() require.NoError(t, err) - err = MkdirAll(context.Background(), absPath, -1, -1) + err = MkdirAll(context.Background(), absPath, -1, -1, 0o755) require.NoError(t, err) require.DirExists(t, absPath) } @@ -139,7 +139,7 @@ func TestMkdirAll_RelPath_NotExists(t *testing.T) { relPath := "a/b/c" absPath := path.Join(cwd, relPath) 
require.NoDirExists(t, absPath) - err := MkdirAll(context.Background(), relPath, -1, -1) + err := MkdirAll(context.Background(), relPath, -1, -1, 0o755) require.NoError(t, err) require.DirExists(t, absPath) } @@ -151,7 +151,7 @@ func TestMkdirAll_RelPath_Exists(t *testing.T) { absPath := path.Join(cwd, relPath) err := os.MkdirAll(absPath, 0o755) require.NoError(t, err) - err = MkdirAll(context.Background(), relPath, -1, -1) + err = MkdirAll(context.Background(), relPath, -1, -1, 0o755) require.NoError(t, err) require.DirExists(t, absPath) } diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go index fc4039cf96..cd3bd1be99 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/executor/executor.go @@ -9,7 +9,6 @@ import ( "net/url" "os" "os/exec" - osuser "os/user" "path" "path/filepath" "runtime" @@ -27,6 +26,7 @@ import ( "github.com/dstackai/dstack/runner/consts" "github.com/dstackai/dstack/runner/internal/common" "github.com/dstackai/dstack/runner/internal/connections" + linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" "github.com/dstackai/dstack/runner/internal/log" "github.com/dstackai/dstack/runner/internal/schemas" "github.com/dstackai/dstack/runner/internal/ssh" @@ -52,14 +52,13 @@ type ConnectionTracker interface { } type RunExecutor struct { - tempDir string - homeDir string - dstackDir string + tempDir string + dstackDir string + currentUser linuxuser.User + sshd ssh.SshdManager + fileArchiveDir string repoBlobDir string - sshd ssh.SshdManager - - currentUid uint32 run schemas.Run jobSpec schemas.JobSpec @@ -69,10 +68,9 @@ type RunExecutor struct { repoCredentials *schemas.RepoCredentials repoDir string repoBlobPath string - jobUid int - jobGid int - jobHomeDir string - jobWorkingDir string + // If the user is not specified in the JobSpec, jobUser should point to currentUser + jobUser *linuxuser.User + jobWorkingDir string mu *sync.RWMutex state string @@ -93,17 +91,9 @@ func (s 
*stubConnectionTracker) GetNoConnectionsSecs() int64 { return 0 } func (s *stubConnectionTracker) Track(ticker <-chan time.Time) {} func (s *stubConnectionTracker) Stop() {} -func NewRunExecutor(tempDir string, homeDir string, dstackDir string, sshd ssh.SshdManager) (*RunExecutor, error) { +func NewRunExecutor(tempDir string, dstackDir string, currentUser linuxuser.User, sshd ssh.SshdManager) (*RunExecutor, error) { mu := &sync.RWMutex{} timestamp := NewMonotonicTimestamp() - user, err := osuser.Current() - if err != nil { - return nil, fmt.Errorf("failed to get current user: %w", err) - } - uid, err := parseStringId(user.Uid) - if err != nil { - return nil, fmt.Errorf("failed to parse current user uid: %w", err) - } // Try to initialize procfs, but don't fail if it's not available (e.g., on macOS) var connectionTracker ConnectionTracker @@ -124,15 +114,13 @@ func NewRunExecutor(tempDir string, homeDir string, dstackDir string, sshd ssh.S } return &RunExecutor{ - tempDir: tempDir, - homeDir: homeDir, - dstackDir: dstackDir, + tempDir: tempDir, + dstackDir: dstackDir, + currentUser: currentUser, + sshd: sshd, + fileArchiveDir: filepath.Join(tempDir, "file_archives"), repoBlobDir: filepath.Join(tempDir, "repo_blobs"), - sshd: sshd, - currentUid: uid, - jobUid: -1, - jobGid: -1, mu: mu, state: WaitSubmit, @@ -188,29 +176,41 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { ctx = log.WithLogger(ctx, log.NewEntry(logger, int(log.DefaultEntry.Logger.Level))) // todo loglevel log.Info(ctx, "Run job", "log_level", log.GetLogger(ctx).Logger.Level.String()) - if ex.jobSpec.User == nil { - ex.jobSpec.User = &schemas.User{Uid: &ex.currentUid} - } - if err := fillUser(ex.jobSpec.User); err != nil { + if err := ex.setJobUser(ctx); err != nil { ex.SetJobStateWithTerminationReason( ctx, types.JobStateFailed, types.TerminationReasonExecutorError, - fmt.Sprintf("Failed to fill in the job user fields (%s)", err), + fmt.Sprintf("Failed to set job user (%s)", err), ) - 
return fmt.Errorf("fill user: %w", err) + return fmt.Errorf("set job user: %w", err) } - ex.setJobCredentials(ctx) + // setJobUser sets User.HomeDir to "/" if the original home dir is not set or not accessible, + // in that case we skip home dir provisioning + if ex.jobUser.HomeDir == "/" { + log.Info(ctx, "Skipping home dir provisioning") + } else { + // All home dir-related errors are considered non-fatal + cleanupGitCredentials, err := ex.setupGitCredentials(ctx) + if err != nil { + log.Error(ctx, "Failed to set up Git credentials", "err", err) + } else { + defer cleanupGitCredentials() + } + if err := ex.setupClusterSsh(ctx); err != nil { + log.Error(ctx, "Failed to set up cluster SSH", "err", err) + } + } if err := ex.setJobWorkingDir(ctx); err != nil { ex.SetJobStateWithTerminationReason( ctx, types.JobStateFailed, types.TerminationReasonExecutorError, - fmt.Sprintf("Failed to set up the working dir (%s)", err), + fmt.Sprintf("Failed to set job working dir (%s)", err), ) - return fmt.Errorf("prepare job working dir: %w", err) + return fmt.Errorf("set job working dir: %w", err) } if err := ex.setupRepo(ctx); err != nil { @@ -233,13 +233,6 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { return fmt.Errorf("setup files: %w", err) } - cleanupCredentials, err := ex.setupCredentials(ctx) - if err != nil { - ex.SetJobState(ctx, types.JobStateFailed) - return fmt.Errorf("setup credentials: %w", err) - } - defer cleanupCredentials() - connectionTrackerTicker := time.NewTicker(2500 * time.Millisecond) go ex.connectionTracker.Track(connectionTrackerTicker.C) defer ex.connectionTracker.Stop() @@ -339,21 +332,7 @@ func (ex *RunExecutor) SetRunnerState(state string) { ex.state = state } -func (ex *RunExecutor) setJobCredentials(ctx context.Context) { - if ex.jobSpec.User.Uid != nil { - ex.jobUid = int(*ex.jobSpec.User.Uid) - } - if ex.jobSpec.User.Gid != nil { - ex.jobGid = int(*ex.jobSpec.User.Gid) - } - if ex.jobSpec.User.HomeDir != "" { - ex.jobHomeDir = 
ex.jobSpec.User.HomeDir - } else { - ex.jobHomeDir = "/" - } - log.Trace(ctx, "Job credentials", "uid", ex.jobUid, "gid", ex.jobGid, "home", ex.jobHomeDir) -} - +// setJobWorkingDir must be called from Run after setJobUser func (ex *RunExecutor) setJobWorkingDir(ctx context.Context) error { var err error if ex.jobSpec.WorkingDir == nil { @@ -362,18 +341,73 @@ func (ex *RunExecutor) setJobWorkingDir(ctx context.Context) error { return fmt.Errorf("get working directory: %w", err) } } else { - ex.jobWorkingDir, err = common.ExpandPath(*ex.jobSpec.WorkingDir, "", ex.jobHomeDir) + ex.jobWorkingDir, err = common.ExpandPath(*ex.jobSpec.WorkingDir, "", ex.jobUser.HomeDir) if err != nil { return fmt.Errorf("expand working dir path: %w", err) } if !path.IsAbs(ex.jobWorkingDir) { - return fmt.Errorf("working_dir must be absolute: %s", ex.jobWorkingDir) + return fmt.Errorf("working dir must be absolute: %s", ex.jobWorkingDir) } } log.Trace(ctx, "Job working dir", "path", ex.jobWorkingDir) return nil } +// setupClusterSsh must be called from Run after setJobUser +func (ex *RunExecutor) setupClusterSsh(ctx context.Context) error { + if ex.jobSpec.SSHKey == nil || len(ex.clusterInfo.JobIPs) < 2 { + return nil + } + + sshDir, err := prepareUserSshDir(ex.jobUser) + if err != nil { + return fmt.Errorf("prepare user ssh dir: %w", err) + } + + privatePath := filepath.Join(sshDir, "dstack_job") + privateFile, err := os.OpenFile(privatePath, os.O_TRUNC|os.O_WRONLY|os.O_CREATE, 0o600) + if err != nil { + return fmt.Errorf("open private key file: %w", err) + } + defer privateFile.Close() + if err := os.Chown(privatePath, ex.jobUser.Uid, ex.jobUser.Uid); err != nil { + return fmt.Errorf("chown private key: %w", err) + } + if _, err := privateFile.WriteString(ex.jobSpec.SSHKey.Private); err != nil { + return fmt.Errorf("write private key: %w", err) + } + + // TODO: move job hosts config to ~/.dstack/ssh/config.d/current_job.conf + // and add "Include ~/.dstack/ssh/config.d/*.conf" directive 
to ~/.ssh/config if not present + // instead of appending job hosts config directly (don't bloat user's ssh_config) + configPath := filepath.Join(sshDir, "config") + configFile, err := os.OpenFile(configPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o600) + if err != nil { + return fmt.Errorf("open SSH config: %w", err) + } + defer configFile.Close() + if err := os.Chown(configPath, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { + return fmt.Errorf("chown SSH config: %w", err) + } + configBuffer := new(bytes.Buffer) + for _, ip := range ex.clusterInfo.JobIPs { + fmt.Fprintf(configBuffer, "\nHost %s\n", ip) + fmt.Fprintf(configBuffer, " Port %d\n", ex.sshd.Port()) + configBuffer.WriteString(" StrictHostKeyChecking no\n") + configBuffer.WriteString(" UserKnownHostsFile /dev/null\n") + fmt.Fprintf(configBuffer, " IdentityFile %s\n", privatePath) + } + if _, err := configFile.Write(configBuffer.Bytes()); err != nil { + return fmt.Errorf("write SSH config: %w", err) + } + + if err := ex.sshd.AddAuthorizedKeys(ctx, ex.jobSpec.SSHKey.Public); err != nil { + return fmt.Errorf("add authorized key: %w", err) + } + + return nil +} + func (ex *RunExecutor) getRepoData() schemas.RepoData { if ex.jobSpec.RepoData == nil { // jobs submitted before 0.19.17 do not have jobSpec.RepoData @@ -425,33 +459,26 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error } cmd.WaitDelay = ex.killDelay // kills the process if it doesn't exit in time - if err := common.MkdirAll(ctx, ex.jobWorkingDir, ex.jobUid, ex.jobGid); err != nil { + if err := common.MkdirAll(ctx, ex.jobWorkingDir, ex.jobUser.Uid, ex.jobUser.Gid, 0o755); err != nil { return fmt.Errorf("create working directory: %w", err) } cmd.Dir = ex.jobWorkingDir - // User must be already set - user := ex.jobSpec.User // Strictly speaking, we need CAP_SETUID and CAP_GUID (for Cmd.Start()-> // Cmd.SysProcAttr.Credential) and CAP_CHOWN (for startCommand()->os.Chown()), // but for the sake of simplicity we instead 
check if we are root or not - if ex.currentUid == 0 { - log.Trace( - ctx, "Using credentials", - "uid", *user.Uid, "gid", *user.Gid, "groups", user.GroupIds, - "username", user.GetUsername(), "groupname", user.GetGroupname(), - "home", user.HomeDir, - ) + if ex.currentUser.IsRoot() { + log.Trace(ctx, "Using credentials", "user", ex.jobUser) if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } - cmd.SysProcAttr.Credential = &syscall.Credential{ - Uid: *user.Uid, - Gid: *user.Gid, - Groups: user.GroupIds, + creds, err := ex.jobUser.ProcessCredentials() + if err != nil { + return fmt.Errorf("prepare process credentials: %w", err) } + cmd.SysProcAttr.Credential = creds } else { - log.Info(ctx, "Current user is not root, cannot set process credentials", "uid", ex.currentUid) + log.Info(ctx, "Current user is not root, cannot set process credentials", "user", ex.currentUser) } envMap := NewEnvMap(ParseEnvList(os.Environ()), jobEnvs, ex.secrets) @@ -466,54 +493,11 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error log.Warning(ctx, "failed to include dstack_profile", "path", profilePath, "err", err) } - // As of 2024-11-29, ex.homeDir is always set to /root - if _, err := prepareSSHDir(-1, -1, ex.homeDir); err != nil { - log.Warning(ctx, "failed to prepare ssh dir", "home", ex.homeDir, "err", err) - } - userSSHDir := "" - uid := -1 - gid := -1 - if user != nil && *user.Uid != 0 { - // non-root user - uid = int(*user.Uid) - gid = int(*user.Gid) - homeDir, isHomeDirAccessible := prepareHomeDir(ctx, uid, gid, user.HomeDir) - envMap["HOME"] = homeDir - if isHomeDirAccessible { - log.Trace(ctx, "provisioning homeDir", "path", homeDir) - userSSHDir, err = prepareSSHDir(uid, gid, homeDir) - if err != nil { - log.Warning(ctx, "failed to prepare ssh dir", "home", homeDir, "err", err) - } - } else { - log.Trace(ctx, "homeDir is not accessible, skipping provisioning", "path", homeDir) - } - } else { - // root user - envMap["HOME"] = 
ex.homeDir - userSSHDir = filepath.Join(ex.homeDir, ".ssh") - } - - if ex.jobSpec.SSHKey != nil && userSSHDir != "" { - err := configureSSH( - ex.jobSpec.SSHKey.Private, ex.clusterInfo.JobIPs, ex.sshd.Port(), - uid, gid, userSSHDir, - ) - if err == nil { - err = ex.sshd.AddAuthorizedKeys(ctx, ex.jobSpec.SSHKey.Public) - } - if err != nil { - log.Warning(ctx, "failed to configure SSH", "err", err) - } - } - err = writeMpiHostfile(ctx, ex.clusterInfo.JobIPs, gpusPerNodeNum, mpiHostfilePath) if err != nil { return fmt.Errorf("write MPI hostfile: %w", err) } - cmd.Env = envMap.Render() - // Configure process resource limits // TODO: Make rlimits customizable in the run configuration. Currently, we only set max locked memory // to unlimited to fix the issue with InfiniBand/RDMA: "Cannot allocate memory". @@ -529,6 +513,10 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error log.Error(ctx, "Failed to set resource limits", "err", err) } + // HOME must be added after writeDstackProfile to avoid overriding the correct per-user value set by sshd + envMap["HOME"] = ex.jobUser.HomeDir + cmd.Env = envMap.Render() + log.Trace(ctx, "Starting exec", "cmd", cmd.String(), "working_dir", cmd.Dir, "env", cmd.Env) ptm, err := startCommand(cmd) @@ -551,26 +539,32 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error return nil } -func (ex *RunExecutor) setupCredentials(ctx context.Context) (func(), error) { +// setupGitCredentials must be called from Run after setJobUser +func (ex *RunExecutor) setupGitCredentials(ctx context.Context) (func(), error) { if ex.repoCredentials == nil { return func() {}, nil } + switch ex.repoCredentials.GetProtocol() { case "ssh": if ex.repoCredentials.PrivateKey == nil { return nil, fmt.Errorf("private key is missing") } - keyPath := filepath.Join(ex.homeDir, ".ssh/id_rsa") + sshDir, err := prepareUserSshDir(ex.jobUser) + if err != nil { + return nil, fmt.Errorf("prepare user ssh dir: %w", err) + 
} + keyPath := filepath.Join(sshDir, "id_rsa") if _, err := os.Stat(keyPath); err == nil { return nil, fmt.Errorf("private key already exists") } - if err := os.MkdirAll(filepath.Dir(keyPath), 0o700); err != nil { - return nil, fmt.Errorf("create ssh directory: %w", err) - } log.Info(ctx, "Writing private key", "path", keyPath) if err := os.WriteFile(keyPath, []byte(*ex.repoCredentials.PrivateKey), 0o600); err != nil { return nil, fmt.Errorf("write private key: %w", err) } + if err := os.Chown(keyPath, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { + return nil, fmt.Errorf("chown private key: %w", err) + } return func() { log.Info(ctx, "Removing private key", "path", keyPath) _ = os.Remove(keyPath) @@ -579,11 +573,11 @@ func (ex *RunExecutor) setupCredentials(ctx context.Context) (func(), error) { if ex.repoCredentials.OAuthToken == nil { return func() {}, nil } - hostsPath := filepath.Join(ex.homeDir, ".config/gh/hosts.yml") + hostsPath := filepath.Join(ex.jobUser.HomeDir, ".config/gh/hosts.yml") if _, err := os.Stat(hostsPath); err == nil { return nil, fmt.Errorf("hosts.yml file already exists") } - if err := os.MkdirAll(filepath.Dir(hostsPath), 0o700); err != nil { + if err := common.MkdirAll(ctx, filepath.Dir(hostsPath), ex.jobUser.Uid, ex.jobUser.Gid, 0o700); err != nil { return nil, fmt.Errorf("create gh config directory: %w", err) } log.Info(ctx, "Writing OAuth token", "path", hostsPath) @@ -595,6 +589,9 @@ func (ex *RunExecutor) setupCredentials(ctx context.Context) (func(), error) { if err := os.WriteFile(hostsPath, []byte(ghHost), 0o600); err != nil { return nil, fmt.Errorf("write OAuth token: %w", err) } + if err := os.Chown(hostsPath, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { + return nil, fmt.Errorf("chown OAuth token: %w", err) + } return func() { log.Info(ctx, "Removing OAuth token", "path", hostsPath) _ = os.Remove(hostsPath) @@ -643,104 +640,6 @@ func buildLDLibraryPathEnv(ctx context.Context) (string, error) { return currentLDPath, nil } -// 
fillUser fills missing User fields -// Since normally only one kind of identifier is set (either id or name), we don't check -// (id, name) pair consistency -- id has higher priority and overwites name with a real -// name, ignoring the already set name value (if any) -// HomeDir and SupplementaryGroupIds are always set unconditionally, as they are not -// provided by the dstack server -func fillUser(user *schemas.User) error { - if user.Uid == nil && user.Username == nil { - return errors.New("neither Uid nor Username is set") - } - - if user.Gid == nil && user.Groupname != nil { - osGroup, err := osuser.LookupGroup(*user.Groupname) - if err != nil { - return fmt.Errorf("failed to look up group by Groupname: %w", err) - } - gid, err := parseStringId(osGroup.Gid) - if err != nil { - return fmt.Errorf("failed to parse group Gid: %w", err) - } - user.Gid = &gid - } - - var osUser *osuser.User - - if user.Uid == nil { - var err error - osUser, err = osuser.Lookup(*user.Username) - if err != nil { - return fmt.Errorf("failed to look up user by Username: %w", err) - } - uid, err := parseStringId(osUser.Uid) - if err != nil { - return fmt.Errorf("failed to parse Uid: %w", err) - } - user.Uid = &uid - } else { - var err error - osUser, err = osuser.LookupId(strconv.Itoa(int(*user.Uid))) - if err != nil { - var notFoundErr osuser.UnknownUserIdError - if !errors.As(err, ¬FoundErr) { - return fmt.Errorf("failed to look up user by Uid: %w", err) - } - } - } - - if osUser != nil { - user.Username = &osUser.Username - user.HomeDir = osUser.HomeDir - } else { - user.Username = nil - user.HomeDir = "" - } - - // If Gid is not set, either directly or via Groupname, use user's primary group - // and supplementary groups, see https://docs.docker.com/reference/dockerfile/#user - // If user doesn't exist, set Gid to 0 and supplementary groups to an empty list - if user.Gid == nil { - if osUser != nil { - gid, err := parseStringId(osUser.Gid) - if err != nil { - return 
fmt.Errorf("failed to parse primary Gid: %w", err) - } - user.Gid = &gid - groupStringIds, err := osUser.GroupIds() - if err != nil { - return fmt.Errorf("failed to get supplementary groups: %w", err) - } - var groupIds []uint32 - for _, groupStringId := range groupStringIds { - groupId, err := parseStringId(groupStringId) - if err != nil { - return fmt.Errorf("failed to parse supplementary group id: %w", err) - } - groupIds = append(groupIds, groupId) - } - user.GroupIds = groupIds - } else { - var fallbackGid uint32 = 0 - user.Gid = &fallbackGid - user.GroupIds = []uint32{} - } - } - return nil -} - -func parseStringId(stringId string) (uint32, error) { - id, err := strconv.ParseInt(stringId, 10, 32) - if err != nil { - return 0, err - } - if id < 0 { - return 0, fmt.Errorf("negative id value: %d", id) - } - return uint32(id), nil -} - // A simplified copypasta of creack/pty Start->StartWithSize->StartWithAttrs // with two additions: // * controlling terminal is properly set (cmd.Extrafiles, Cmd.SysProcAttr.Ctty) @@ -784,55 +683,24 @@ func startCommand(cmd *exec.Cmd) (*os.File, error) { return ptm, nil } -func prepareHomeDir(ctx context.Context, uid int, gid int, homeDir string) (string, bool) { - if homeDir == "" { - // user does not exist - return "/", false - } - if info, err := os.Stat(homeDir); errors.Is(err, os.ErrNotExist) { - if strings.Contains(homeDir, "nonexistent") { - // let `/nonexistent` stay non-existent - return homeDir, false - } - if err = os.MkdirAll(homeDir, 0o755); err != nil { - log.Warning(ctx, "failed to create homeDir", "err", err) - return homeDir, false - } - if err = os.Chmod(homeDir, 0o750); err != nil { - log.Warning(ctx, "failed to chmod homeDir", "err", err) - } - if err = os.Chown(homeDir, uid, gid); err != nil { - log.Warning(ctx, "failed to chown homeDir", "err", err) - } - return homeDir, true - } else if err != nil { - log.Warning(ctx, "homeDir is not accessible", "err", err) - return homeDir, false - } else if !info.IsDir() 
{ - log.Warning(ctx, "HomeDir is not a dir", "path", homeDir) - return homeDir, false - } - return homeDir, true -} - -func prepareSSHDir(uid int, gid int, homeDir string) (string, error) { - sshDir := filepath.Join(homeDir, ".ssh") +func prepareUserSshDir(user *linuxuser.User) (string, error) { + sshDir := filepath.Join(user.HomeDir, ".ssh") info, err := os.Stat(sshDir) if err == nil { if !info.IsDir() { return "", fmt.Errorf("not a directory: %s", sshDir) } - if err = os.Chmod(sshDir, 0o700); err != nil { + if err := os.Chmod(sshDir, 0o700); err != nil { return "", fmt.Errorf("chmod ssh dir: %w", err) } } else if errors.Is(err, os.ErrNotExist) { - if err = os.MkdirAll(sshDir, 0o700); err != nil { + if err := os.MkdirAll(sshDir, 0o700); err != nil { return "", fmt.Errorf("create ssh dir: %w", err) } } else { return "", err } - if err = os.Chown(sshDir, uid, gid); err != nil { + if err := os.Chown(sshDir, user.Uid, user.Gid); err != nil { return "", fmt.Errorf("chown ssh dir: %w", err) } return sshDir, nil @@ -915,43 +783,3 @@ func includeDstackProfile(profilePath string, dstackProfilePath string) error { } return nil } - -func configureSSH(private string, ips []string, port int, uid int, gid int, sshDir string) error { - privatePath := filepath.Join(sshDir, "dstack_job") - privateFile, err := os.OpenFile(privatePath, os.O_TRUNC|os.O_WRONLY|os.O_CREATE, 0o600) - if err != nil { - return fmt.Errorf("open private key file: %w", err) - } - defer privateFile.Close() - if err := os.Chown(privatePath, uid, gid); err != nil { - return fmt.Errorf("chown private key: %w", err) - } - if _, err := privateFile.WriteString(private); err != nil { - return fmt.Errorf("write private key: %w", err) - } - - // TODO: move job hosts config to ~/.dstack/ssh/config.d/current_job - // and add "Include ~/.dstack/ssh/config.d/*" directive to ~/.ssh/config if not present - // instead of appending job hosts config directly (don't bloat user's ssh_config) - configPath := filepath.Join(sshDir, 
"config") - configFile, err := os.OpenFile(configPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o600) - if err != nil { - return fmt.Errorf("open SSH config: %w", err) - } - defer configFile.Close() - if err := os.Chown(configPath, uid, gid); err != nil { - return fmt.Errorf("chown SSH config: %w", err) - } - var configBuffer bytes.Buffer - for _, ip := range ips { - configBuffer.WriteString(fmt.Sprintf("\nHost %s\n", ip)) - configBuffer.WriteString(fmt.Sprintf(" Port %d\n", port)) - configBuffer.WriteString(" StrictHostKeyChecking no\n") - configBuffer.WriteString(" UserKnownHostsFile /dev/null\n") - configBuffer.WriteString(fmt.Sprintf(" IdentityFile %s\n", privatePath)) - } - if _, err := configFile.Write(configBuffer.Bytes()); err != nil { - return fmt.Errorf("write SSH config: %w", err) - } - return nil -} diff --git a/runner/internal/executor/executor_test.go b/runner/internal/executor/executor_test.go index 0d935dd642..105493e301 100644 --- a/runner/internal/executor/executor_test.go +++ b/runner/internal/executor/executor_test.go @@ -14,6 +14,7 @@ import ( "testing" "time" + linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" "github.com/dstackai/dstack/runner/internal/schemas" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -63,7 +64,7 @@ func TestExecutor_HomeDir(t *testing.T) { err := ex.execJob(t.Context(), io.Writer(&b)) assert.NoError(t, err) - assert.Equal(t, ex.homeDir+"\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) + assert.Equal(t, ex.currentUser.HomeDir+"\n", strings.ReplaceAll(b.String(), "\r\n", "\n")) } func TestExecutor_NonZeroExit(t *testing.T) { @@ -90,7 +91,7 @@ func TestExecutor_SSHCredentials(t *testing.T) { PrivateKey: &key, } - clean, err := ex.setupCredentials(t.Context()) + clean, err := ex.setupGitCredentials(t.Context()) defer clean() require.NoError(t, err) @@ -206,14 +207,23 @@ func makeTestExecutor(t *testing.T) *RunExecutor { tempDir := filepath.Join(baseDir, "temp") 
require.NoError(t, os.Mkdir(tempDir, 0o700)) - homeDir := filepath.Join(baseDir, "home") - require.NoError(t, os.Mkdir(homeDir, 0o700)) + dstackDir := filepath.Join(baseDir, "dstack") require.NoError(t, os.Mkdir(dstackDir, 0o755)) - ex, err := NewRunExecutor(tempDir, homeDir, dstackDir, new(sshdMock)) + + currentUser, err := linuxuser.FromCurrentProcess() + require.NoError(t, err) + homeDir := filepath.Join(baseDir, "home") + require.NoError(t, os.Mkdir(homeDir, 0o700)) + currentUser.HomeDir = homeDir + + ex, err := NewRunExecutor(tempDir, dstackDir, *currentUser, new(sshdMock)) require.NoError(t, err) + ex.SetJob(body) + require.NoError(t, ex.setJobUser(t.Context())) require.NoError(t, ex.setJobWorkingDir(t.Context())) + return ex } diff --git a/runner/internal/executor/files.go b/runner/internal/executor/files.go index ee1170c418..6b992ce2c1 100644 --- a/runner/internal/executor/files.go +++ b/runner/internal/executor/files.go @@ -34,19 +34,22 @@ func (ex *RunExecutor) WriteFileArchive(id string, src io.Reader) error { return nil } -// setupFiles must be called from Run -// Must be called after setJobWorkingDir and setJobCredentials +// setupFiles must be called from Run after setJobUser and setJobWorkingDir func (ex *RunExecutor) setupFiles(ctx context.Context) error { log.Trace(ctx, "Setting up files") if ex.jobWorkingDir == "" { - return errors.New("setup files: working dir is not set") + return errors.New("working dir is not set") } if !filepath.IsAbs(ex.jobWorkingDir) { - return fmt.Errorf("setup files: working dir must be absolute: %s", ex.jobWorkingDir) + return fmt.Errorf("working dir must be absolute: %s", ex.jobWorkingDir) } for _, fa := range ex.jobSpec.FileArchives { archivePath := path.Join(ex.fileArchiveDir, fa.Id) - if err := extractFileArchive(ctx, archivePath, fa.Path, ex.jobWorkingDir, ex.jobUid, ex.jobGid, ex.jobHomeDir); err != nil { + err := extractFileArchive( + ctx, archivePath, fa.Path, ex.jobWorkingDir, ex.jobUser.HomeDir, + 
ex.jobUser.Uid, ex.jobUser.Gid, + ) + if err != nil { return fmt.Errorf("extract file archive %s: %w", fa.Id, err) } } @@ -56,7 +59,7 @@ func (ex *RunExecutor) setupFiles(ctx context.Context) error { return nil } -func extractFileArchive(ctx context.Context, archivePath string, destPath string, baseDir string, uid int, gid int, homeDir string) error { +func extractFileArchive(ctx context.Context, archivePath string, destPath string, baseDir string, homeDir string, uid int, gid int) error { log.Trace(ctx, "Extracting file archive", "archive", archivePath, "dest", destPath, "base", baseDir, "home", homeDir) destPath, err := common.ExpandPath(destPath, baseDir, homeDir) @@ -64,7 +67,7 @@ func extractFileArchive(ctx context.Context, archivePath string, destPath string return fmt.Errorf("expand destination path: %w", err) } destBase, destName := path.Split(destPath) - if err := common.MkdirAll(ctx, destBase, uid, gid); err != nil { + if err := common.MkdirAll(ctx, destBase, uid, gid, 0o755); err != nil { return fmt.Errorf("create destination directory: %w", err) } if err := os.RemoveAll(destPath); err != nil { @@ -88,11 +91,9 @@ func extractFileArchive(ctx context.Context, archivePath string, destPath string return fmt.Errorf("extract tar archive: %w", err) } - if uid != -1 || gid != -1 { - for _, p := range paths { - if err := os.Chown(path.Join(destBase, p), uid, gid); err != nil { - log.Warning(ctx, "Failed to chown", "path", p, "err", err) - } + for _, p := range paths { + if err := os.Chown(path.Join(destBase, p), uid, gid); err != nil { + log.Warning(ctx, "Failed to chown", "path", p, "err", err) } } diff --git a/runner/internal/executor/repo.go b/runner/internal/executor/repo.go index 2f757f63c6..467c783a88 100644 --- a/runner/internal/executor/repo.go +++ b/runner/internal/executor/repo.go @@ -36,22 +36,21 @@ func (ex *RunExecutor) WriteRepoBlob(src io.Reader) error { return nil } -// setupRepo must be called from Run -// Must be called after setJobWorkingDir 
and setJobCredentials +// setupRepo must be called from Run after setJobUser and setJobWorkingDir func (ex *RunExecutor) setupRepo(ctx context.Context) error { log.Trace(ctx, "Setting up repo") if ex.jobWorkingDir == "" { - return errors.New("setup repo: working dir is not set") + return errors.New("working dir is not set") } if !filepath.IsAbs(ex.jobWorkingDir) { - return fmt.Errorf("setup repo: working dir must be absolute: %s", ex.jobWorkingDir) + return fmt.Errorf("working dir must be absolute: %s", ex.jobWorkingDir) } if ex.jobSpec.RepoDir == nil { - return errors.New("repo_dir is not set") + return errors.New("repo dir is not set") } var err error - ex.repoDir, err = common.ExpandPath(*ex.jobSpec.RepoDir, ex.jobWorkingDir, ex.jobHomeDir) + ex.repoDir, err = common.ExpandPath(*ex.jobSpec.RepoDir, ex.jobWorkingDir, ex.jobUser.HomeDir) if err != nil { return fmt.Errorf("expand repo dir path: %w", err) } @@ -71,12 +70,12 @@ func (ex *RunExecutor) setupRepo(ctx context.Context) error { } switch repoExistsAction { case schemas.RepoExistsActionError: - return fmt.Errorf("setup repo: repo dir is not empty: %s", ex.repoDir) + return fmt.Errorf("repo dir is not empty: %s", ex.repoDir) case schemas.RepoExistsActionSkip: log.Info(ctx, "Skipping repo checkout: repo dir is not empty", "path", ex.repoDir) return nil default: - return fmt.Errorf("setup repo: unsupported action: %s", repoExistsAction) + return fmt.Errorf("unsupported action: %s", repoExistsAction) } } @@ -237,9 +236,6 @@ func (ex *RunExecutor) restoreRepoDir(ctx context.Context, tmpDir string) error func (ex *RunExecutor) chownRepoDir(ctx context.Context) error { log.Trace(ctx, "Chowning repo dir") - if ex.jobUid == -1 && ex.jobGid == -1 { - return nil - } return filepath.WalkDir( ex.repoDir, func(p string, d fs.DirEntry, err error) error { @@ -248,7 +244,7 @@ func (ex *RunExecutor) chownRepoDir(ctx context.Context) error { log.Debug(ctx, "Error while walking repo dir", "path", p, "err", err) return nil } - 
if err := os.Chown(p, ex.jobUid, ex.jobGid); err != nil { + if err := os.Chown(p, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { log.Debug(ctx, "Error while chowning repo dir", "path", p, "err", err) } return nil diff --git a/runner/internal/executor/user.go b/runner/internal/executor/user.go new file mode 100644 index 0000000000..30affda617 --- /dev/null +++ b/runner/internal/executor/user.go @@ -0,0 +1,184 @@ +package executor + +import ( + "context" + "errors" + "fmt" + "os" + osuser "os/user" + "path" + "strconv" + "strings" + + linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" + "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/schemas" +) + +func (ex *RunExecutor) setJobUser(ctx context.Context) error { + if ex.jobSpec.User == nil { + // JobSpec.User is nil if the user is not specified either in the dstack configuration + // (the `user` property) or in the image (the `USER` Dockerfile instruction). + // In such cases, the root user should be used as a fallback, and we use the current user, + // assuming that the runner is started by root. 
+ ex.jobUser = &ex.currentUser + } else { + jobUser, err := jobUserFromJobSpecUser( + ex.jobSpec.User, + osuser.LookupId, osuser.Lookup, + osuser.LookupGroup, (*osuser.User).GroupIds, + ) + if err != nil { + return fmt.Errorf("job user from job spec: %w", err) + } + ex.jobUser = jobUser + } + + if err := checkHomeDir(ex.jobUser.HomeDir); err != nil { + log.Warning(ctx, "Error while checking job user home dir, using / instead", "err", err) + ex.jobUser.HomeDir = "/" + } + + log.Trace(ctx, "Job user", "user", ex.jobUser) + return nil +} + +func jobUserFromJobSpecUser( + jobSpecUser *schemas.User, + userLookupIdFunc func(string) (*osuser.User, error), + userLookupNameFunc func(string) (*osuser.User, error), + groupLookupNameFunc func(string) (*osuser.Group, error), + userGroupIdsFunc func(*osuser.User) ([]string, error), +) (*linuxuser.User, error) { + if jobSpecUser.Uid == nil && jobSpecUser.Username == nil { + return nil, errors.New("neither uid nor username is set") + } + + var err error + var osUser *osuser.User + + // -1 is a placeholder value, the actual value must be >= 0 + //nolint:ineffassign + uid := -1 + if jobSpecUser.Uid != nil { + uid = int(*jobSpecUser.Uid) + osUser, err = userLookupIdFunc(strconv.Itoa(uid)) + if err != nil { + var notFoundErr osuser.UnknownUserIdError + if !errors.As(err, ¬FoundErr) { + return nil, fmt.Errorf("lookup user by id: %w", err) + } + } + } else { + osUser, err = userLookupNameFunc(*jobSpecUser.Username) + if err != nil { + return nil, fmt.Errorf("lookup user by name: %w", err) + } + uid, err = parseStringId(osUser.Uid) + if err != nil { + return nil, fmt.Errorf("parse user id: %w", err) + } + } + if uid == -1 { + // Assertion, should never occur + return nil, errors.New("failed to infer user id") + } + + // -1 is a placeholder value, the actual value must be >= 0 + //nolint:ineffassign + gid := -1 + // Must include at least one gid, see len(gids) == 0 assertion below + var gids []int + if jobSpecUser.Gid != nil { + gid = 
int(*jobSpecUser.Gid) + // Here and below: + // > Note that when specifying a group for the user, the user will have + // > only the specified group membership. + // > Any other configured group memberships will be ignored. + // See: https://docs.docker.com/reference/dockerfile/#user + gids = []int{gid} + } else if jobSpecUser.Groupname != nil { + osGroup, err := groupLookupNameFunc(*jobSpecUser.Groupname) + if err != nil { + return nil, fmt.Errorf("lookup group by name: %w", err) + } + gid, err = parseStringId(osGroup.Gid) + if err != nil { + return nil, fmt.Errorf("parse group id: %w", err) + } + gids = []int{gid} + } else if osUser != nil { + gid, err = parseStringId(osUser.Gid) + if err != nil { + return nil, fmt.Errorf("parse group id: %w", err) + } + rawGids, err := userGroupIdsFunc(osUser) + if err != nil { + return nil, fmt.Errorf("get user supplementary group ids: %w", err) + } + // [main_gid, supplementary_gid_1, supplementary_gid_2, ...] + gids = make([]int, len(rawGids)+1) + gids[0] = gid + for index, rawGid := range rawGids { + supplementaryGid, err := parseStringId(rawGid) + if err != nil { + return nil, fmt.Errorf("parse supplementary group id: %w", err) + } + gids[index+1] = supplementaryGid + } + } else { + // > When the user doesn't have a primary group then the image + // > (or the next instructions) will be run with the root group. 
+ // See: https://docs.docker.com/reference/dockerfile/#user + gid = 0 + gids = []int{gid} + } + if gid == -1 { + // Assertion, should never occur + return nil, errors.New("failed to infer group id") + } + if len(gids) == 0 { + // Assertion, should never occur + return nil, errors.New("failed to infer supplementary group ids") + } + + username := "" + homeDir := "" + if osUser != nil { + username = osUser.Username + homeDir = osUser.HomeDir + } + + return linuxuser.NewUser(uid, gid, gids, username, homeDir), nil +} + +func parseStringId(stringId string) (int, error) { + id, err := strconv.Atoi(stringId) + if err != nil { + return 0, err + } + if id < 0 { + return 0, fmt.Errorf("negative id value: %d", id) + } + return id, nil +} + +func checkHomeDir(homeDir string) error { + if homeDir == "" { + return errors.New("not set") + } + if !path.IsAbs(homeDir) { + return fmt.Errorf("must be absolute: %s", homeDir) + } + if info, err := os.Stat(homeDir); errors.Is(err, os.ErrNotExist) { + if strings.Contains(homeDir, "nonexistent") { + // let `/nonexistent` stay non-existent + return fmt.Errorf("non-existent: %s", homeDir) + } + } else if err != nil { + return err + } else if !info.IsDir() { + return fmt.Errorf("not a directory: %s", homeDir) + } + return nil +} diff --git a/runner/internal/executor/user_test.go b/runner/internal/executor/user_test.go new file mode 100644 index 0000000000..2bc6a19d87 --- /dev/null +++ b/runner/internal/executor/user_test.go @@ -0,0 +1,232 @@ +package executor + +import ( + "errors" + osuser "os/user" + "strconv" + "testing" + + "github.com/stretchr/testify/require" + + linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" + "github.com/dstackai/dstack/runner/internal/schemas" +) + +var shouldNotBeCalledErr = errors.New("this function should not be called") + +func unknownUserIdError(t *testing.T, strUid string) osuser.UnknownUserIdError { + t.Helper() + uid, err := strconv.Atoi(strUid) + require.NoError(t, err) + return 
osuser.UnknownUserIdError(uid) +} + +func TestJobUserFromJobSpecUser_Uid_UserDoesNotExist(t *testing.T) { + specUid := uint32(2000) + specUser := schemas.User{Uid: &specUid} + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 0, + Gids: []int{0}, + Username: "", + HomeDir: "", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, unknownUserIdError(t, id) }, + func(name string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Uid_Gid_UserDoesNotExist(t *testing.T) { + specUid := uint32(2000) + specGid := uint32(200) + specUser := schemas.User{Uid: &specUid, Gid: &specGid} + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 200, + Gids: []int{200}, + Username: "", + HomeDir: "", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, unknownUserIdError(t, id) }, + func(name string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Uid_UserExists(t *testing.T) { + specUid := uint32(2000) + specUser := schemas.User{Uid: &specUid} + osUser := osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + osUserGids := []string{"300", "400", "500"} + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 300, + Gids: []int{300, 400, 500}, + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(uid string) 
(*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(gid string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return osUserGids, nil }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Uid_Gid_UserExists(t *testing.T) { + specUid := uint32(2000) + specGid := uint32(200) + specUser := schemas.User{Uid: &specUid, Gid: &specGid} + osUser := osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 200, + Gids: []int{200}, + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Username_UserDoesNotExist(t *testing.T) { + specUsername := "unknownuser" + specUser := schemas.User{Username: &specUsername} + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.User, error) { return nil, osuser.UnknownUserError(name) }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.ErrorContains(t, err, "lookup user by name") + require.Nil(t, user) +} + +func TestJobUserFromJobSpecUser_Username_UserExists(t *testing.T) { + specUsername := "testnuser" + specUser := schemas.User{Username: &specUsername} + osUser := 
osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + osUserGids := []string{"300", "400", "500"} + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 300, + Gids: []int{300, 400, 500}, + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.Group, error) { return nil, shouldNotBeCalledErr }, + func(*osuser.User) ([]string, error) { return osUserGids, nil }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Username_Groupname_UserExists_GroupExists(t *testing.T) { + specUsername := "testnuser" + specGroupname := "testgroup" + specUser := schemas.User{Username: &specUsername, Groupname: &specGroupname} + osUser := osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + HomeDir: "/home/testuser", + } + osGroup := osuser.Group{ + Gid: "200", + Name: specGroupname, + } + expectedUser := linuxuser.User{ + Uid: 2000, + Gid: 200, + Gids: []int{200}, + Username: "testuser", + HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.Group, error) { return &osGroup, nil }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.NoError(t, err) + require.Equal(t, expectedUser, *user) +} + +func TestJobUserFromJobSpecUser_Username_Groupname_UserExists_GroupDoesNotExist(t *testing.T) { + specUsername := "testnuser" + specGroupname := "testgroup" + specUser := schemas.User{Username: &specUsername, Groupname: &specGroupname} + osUser := osuser.User{ + Uid: "2000", + Gid: "300", + Username: "testuser", + 
HomeDir: "/home/testuser", + } + + user, err := jobUserFromJobSpecUser( + &specUser, + func(id string) (*osuser.User, error) { return nil, shouldNotBeCalledErr }, + func(name string) (*osuser.User, error) { return &osUser, nil }, + func(name string) (*osuser.Group, error) { return nil, osuser.UnknownGroupError(name) }, + func(*osuser.User) ([]string, error) { return nil, shouldNotBeCalledErr }, + ) + + require.ErrorContains(t, err, "lookup group by name") + require.Nil(t, user) +} diff --git a/runner/internal/linux/user/user.go b/runner/internal/linux/user/user.go new file mode 100644 index 0000000000..caecc1324f --- /dev/null +++ b/runner/internal/linux/user/user.go @@ -0,0 +1,96 @@ +// Despite this package is being located inside the linux package, it should work on any Unix-like system. +package user + +import ( + "fmt" + osuser "os/user" + "slices" + "strconv" + "syscall" +) + +// User represents the user part of process `credentials(7)` +// (real user ID, real group ID, supplementary group IDs) enriched with +// some info from the user database `passwd(5)` (login name, home dir). +// Note, unlike the User struct from os/user, User does not necessarily +// correspond to any existing user account, for example, any of IDs may not exist +// in passwd(5) or group(5) databases at all or the user may not belong to +// the primary group or any of the specified supplementary groups. +type User struct { + // Real user ID + Uid int + // Real group ID + Gid int + // Supplementary group IDs. 
The primary group should be always included and + // the resulting list should be sorted in ascending order with duplicates removed; + // NewUser() performs such normalization + Gids []int + // May be empty, e.g., if the user does not exist + Username string + // May be Empty, e.g., if the user does not exist + HomeDir string +} + +func (u *User) String() string { + // The format is inspired by `id(1)` + formattedUsername := "" + if u.Username != "" { + formattedUsername = fmt.Sprintf("(%s)", u.Username) + } + return fmt.Sprintf("uid=%d%s gid=%d groups=%v home=%s", u.Uid, formattedUsername, u.Gid, u.Gids, u.HomeDir) +} + +func (u *User) ProcessCredentials() (*syscall.Credential, error) { + if u.Uid < 0 { + return nil, fmt.Errorf("negative user id: %d", u.Uid) + } + if u.Gid < 0 { + return nil, fmt.Errorf("negative group id: %d", u.Gid) + } + groups := make([]uint32, len(u.Gids)) + for index, gid := range u.Gids { + if gid < 0 { + return nil, fmt.Errorf("negative supplementary group id: %d", gid) + } + groups[index] = uint32(gid) + } + creds := syscall.Credential{ + Uid: uint32(u.Uid), + Gid: uint32(u.Gid), + Groups: groups, + } + return &creds, nil +} + +func (u *User) IsRoot() bool { + return u.Uid == 0 +} + +func NewUser(uid int, gid int, gids []int, username string, homeDir string) *User { + normalizedGids := append([]int{gid}, gids...) 
+ slices.Sort(normalizedGids) + normalizedGids = slices.Compact(normalizedGids) + return &User{ + Uid: uid, + Gid: gid, + Gids: normalizedGids, + Username: username, + HomeDir: homeDir, + } +} + +func FromCurrentProcess() (*User, error) { + uid := syscall.Getuid() + gid := syscall.Getgid() + gids, err := syscall.Getgroups() + if err != nil { + return nil, fmt.Errorf("get supplementary groups: %w", err) + } + username := "" + homeDir := "" + if osUser, err := osuser.LookupId(strconv.Itoa(uid)); err == nil { + username = osUser.Username + homeDir = osUser.HomeDir + } + return NewUser(uid, gid, gids, username, homeDir), nil +} diff --git a/runner/internal/schemas/schemas.go b/runner/internal/schemas/schemas.go index 106bc61f87..152637decc 100644 --- a/runner/internal/schemas/schemas.go +++ b/runner/internal/schemas/schemas.go @@ -124,22 +124,6 @@ type User struct { Username *string `json:"username"` Gid *uint32 `json:"gid"` Groupname *string `json:"groupname"` - GroupIds []uint32 - HomeDir string -} - -func (u *User) GetUsername() string { - if u.Username == nil { - return "" - } - return *u.Username -} - -func (u *User) GetGroupname() string { - if u.Groupname == nil { - return "" - } - return *u.Groupname } type HealthcheckResponse struct { diff --git a/runner/internal/shim/docker.go b/runner/internal/shim/docker.go index 7e29e92dd7..1fd8d959af 100644 --- a/runner/internal/shim/docker.go +++ b/runner/internal/shim/docker.go @@ -927,8 +927,6 @@ func getSSHShellCommands() []string { `unset LD_LIBRARY_PATH && unset LD_PRELOAD`, // common functions `exists() { command -v "$1" > /dev/null 2>&1; }`, - // TODO(#1535): support non-root images properly - "mkdir -p /root && chown root:root /root && export HOME=/root", // package manager detection/abstraction `install_pkg() { NAME=Distribution; test -f /etc/os-release && . 
/etc/os-release; echo $NAME not supported; exit 11; }`, `if exists apt-get; then install_pkg() { apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y "$1"; }; fi`, @@ -1190,7 +1188,6 @@ func (c *CLIArgs) DockerShellCommands(publicKeys []string) []string { consts.RunnerBinaryPath, "--log-level", strconv.Itoa(c.Runner.LogLevel), "start", - "--home-dir", consts.RunnerHomeDir, "--temp-dir", consts.RunnerTempDir, "--http-port", strconv.Itoa(c.Runner.HTTPPort), "--ssh-port", strconv.Itoa(c.Runner.SSHPort), diff --git a/src/dstack/_internal/core/backends/base/compute.py b/src/dstack/_internal/core/backends/base/compute.py index 13cba1eb53..75a68e77ff 100644 --- a/src/dstack/_internal/core/backends/base/compute.py +++ b/src/dstack/_internal/core/backends/base/compute.py @@ -944,8 +944,6 @@ def get_docker_commands( "unset LD_LIBRARY_PATH && unset LD_PRELOAD", # common functions 'exists() { command -v "$1" > /dev/null 2>&1; }', - # TODO(#1535): support non-root images properly - "mkdir -p /root && chown root:root /root && export HOME=/root", # package manager detection/abstraction "install_pkg() { NAME=Distribution; test -f /etc/os-release && . 
/etc/os-release; echo $NAME not supported; exit 11; }", 'if exists apt-get; then install_pkg() { apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y "$1"; }; fi', @@ -963,8 +961,6 @@ def get_docker_commands( "--log-level", "6", "start", - "--home-dir", - "/root", "--temp-dir", "/tmp/runner", "--http-port", diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index 53feb9cda5..4f6379b173 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -249,7 +249,6 @@ def run_job( ) ], security_context=client.V1SecurityContext( - # TODO(#1535): support non-root images properly run_as_user=0, run_as_group=0, privileged=job.job_spec.privileged, diff --git a/src/dstack/_internal/server/services/proxy/repo.py b/src/dstack/_internal/server/services/proxy/repo.py index ae7ea19f8d..f8c8d882c8 100644 --- a/src/dstack/_internal/server/services/proxy/repo.py +++ b/src/dstack/_internal/server/services/proxy/repo.py @@ -81,7 +81,7 @@ async def get_service(self, project_name: str, run_name: str) -> Optional[Servic ssh_port = jpd.ssh_port ssh_proxy = jpd.ssh_proxy else: - ssh_destination = "root@localhost" # TODO(#1535): support non-root images properly + ssh_destination = "root@localhost" ssh_port = DSTACK_RUNNER_SSH_PORT job_submission = jobs_services.job_model_to_job_submission(job) jrd = job_submission.job_runtime_data diff --git a/src/dstack/_internal/server/services/ssh.py b/src/dstack/_internal/server/services/ssh.py index a7967d8031..d1ba8ffc83 100644 --- a/src/dstack/_internal/server/services/ssh.py +++ b/src/dstack/_internal/server/services/ssh.py @@ -30,7 +30,7 @@ def container_ssh_tunnel( ssh_port = jpd.ssh_port ssh_proxy = jpd.ssh_proxy else: - ssh_destination = "root@localhost" # TODO(#1535): support non-root images properly + ssh_destination = "root@localhost" ssh_port = DSTACK_RUNNER_SSH_PORT 
job_submission = jobs_services.job_model_to_job_submission(job) jrd = job_submission.job_runtime_data From d0b4cc3de166d7a45d0bb52735afe8b158398758 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 16 Jan 2026 11:53:12 +0500 Subject: [PATCH 053/187] Optimize fleet instances db queries (#3467) * Optimize fleet instances db queries * Use with_loader_criteria in process_submitted_jobs * Use with_loader_criteria in process_instances * Fix master instance selection * TODO on efficient background processing * Add load_only(JobModel.id) * Skip locking finished jobs in process_runs * Comment on non-repeatable read * Delete unused func --- .../_internal/server/background/__init__.py | 9 ++- .../server/background/tasks/process_fleets.py | 11 +++- .../background/tasks/process_instances.py | 65 ++++++++++++++----- .../server/background/tasks/process_runs.py | 22 ++++++- .../tasks/process_submitted_jobs.py | 23 ++++++- .../_internal/server/services/fleets.py | 4 -- .../_internal/server/services/placement.py | 7 +- 7 files changed, 107 insertions(+), 34 deletions(-) diff --git a/src/dstack/_internal/server/background/__init__.py b/src/dstack/_internal/server/background/__init__.py index 85af7d3315..8577cce6f1 100644 --- a/src/dstack/_internal/server/background/__init__.py +++ b/src/dstack/_internal/server/background/__init__.py @@ -42,7 +42,14 @@ def get_scheduler() -> AsyncIOScheduler: def start_background_tasks() -> AsyncIOScheduler: - # We try to process as many resources as possible without exhausting DB connections. + # Background processing is implemented via in-memory locks on SQLite + # and SELECT FOR UPDATE on Postgres. Locks may be held for a long time. + # This is currently the main bottleneck for scaling dstack processing + # as processing more resources requires more DB connections. + # TODO: Make background processing efficient by committing locks to DB + # and processing outside of DB transactions. 
+ # + # Now we just try to process as many resources as possible without exhausting DB connections. # # Quick tasks can process multiple resources per transaction. # Potentially long tasks process one resource per transaction diff --git a/src/dstack/_internal/server/background/tasks/process_fleets.py b/src/dstack/_internal/server/background/tasks/process_fleets.py index 733029abf8..d369c7d242 100644 --- a/src/dstack/_internal/server/background/tasks/process_fleets.py +++ b/src/dstack/_internal/server/background/tasks/process_fleets.py @@ -5,7 +5,7 @@ from sqlalchemy import select, update from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload, load_only, selectinload +from sqlalchemy.orm import joinedload, load_only, selectinload, with_loader_criteria from dstack._internal.core.models.fleets import FleetSpec, FleetStatus from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason @@ -60,6 +60,9 @@ async def process_fleets(): .options( load_only(FleetModel.id, FleetModel.name), selectinload(FleetModel.instances).load_only(InstanceModel.id), + with_loader_criteria( + InstanceModel, InstanceModel.deleted == False, include_aliases=True + ), ) .order_by(FleetModel.last_processed_at.asc()) .limit(BATCH_SIZE) @@ -72,6 +75,7 @@ async def process_fleets(): .where( InstanceModel.id.not_in(instance_lockset), InstanceModel.fleet_id.in_(fleet_ids), + InstanceModel.deleted == False, ) .options(load_only(InstanceModel.id, InstanceModel.fleet_id)) .order_by(InstanceModel.id) @@ -113,8 +117,11 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]) .where(FleetModel.id.in_(fleet_ids)) .options( joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id), - joinedload(FleetModel.project), + with_loader_criteria( + InstanceModel, InstanceModel.deleted == False, include_aliases=True + ), ) + .options(joinedload(FleetModel.project)) 
.options(joinedload(FleetModel.runs).load_only(RunModel.status)) .execution_options(populate_existing=True) ) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 2241c4c6a4..9a14bdc30d 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -11,7 +11,7 @@ from pydantic import ValidationError from sqlalchemy import and_, delete, func, not_, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload +from sqlalchemy.orm import joinedload, with_loader_criteria from dstack._internal import settings from dstack._internal.core.backends.base.compute import ( @@ -79,7 +79,6 @@ fleet_model_to_fleet, get_create_instance_offers, is_cloud_cluster, - is_fleet_master_instance, ) from dstack._internal.server.services.instances import ( get_instance_configuration, @@ -218,7 +217,12 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): .where(InstanceModel.id == instance.id) .options(joinedload(InstanceModel.project).joinedload(ProjectModel.backends)) .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) - .options(joinedload(InstanceModel.fleet).joinedload(FleetModel.instances)) + .options( + joinedload(InstanceModel.fleet).joinedload(FleetModel.instances), + with_loader_criteria( + InstanceModel, InstanceModel.deleted == False, include_aliases=True + ), + ) .execution_options(populate_existing=True) ) instance = res.unique().scalar_one() @@ -228,7 +232,12 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): .where(InstanceModel.id == instance.id) .options(joinedload(InstanceModel.project)) .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) - .options(joinedload(InstanceModel.fleet).joinedload(FleetModel.instances)) + .options( + 
joinedload(InstanceModel.fleet).joinedload(FleetModel.instances), + with_loader_criteria( + InstanceModel, InstanceModel.deleted == False, include_aliases=True + ), + ) .execution_options(populate_existing=True) ) instance = res.unique().scalar_one() @@ -543,8 +552,11 @@ def _deploy_instance( async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None: - if _need_to_wait_fleet_provisioning(instance): - logger.debug("Waiting for the first instance in the fleet to be provisioned") + master_instance = await _get_fleet_master_instance(session, instance) + if _need_to_wait_fleet_provisioning(instance, master_instance): + logger.debug( + "%s: waiting for the first instance in the fleet to be provisioned", fmt(instance) + ) return try: @@ -576,6 +588,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No placement_group_model = get_placement_group_model_for_instance( placement_group_models=placement_group_models, instance_model=instance, + master_instance_model=master_instance, ) offers = await get_create_instance_offers( project=instance.project, @@ -594,11 +607,15 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No continue compute = backend.compute() assert isinstance(compute, ComputeWithCreateInstanceSupport) - instance_offer = _get_instance_offer_for_instance(instance_offer, instance) + instance_offer = _get_instance_offer_for_instance( + instance_offer=instance_offer, + instance=instance, + master_instance=master_instance, + ) if ( instance.fleet and is_cloud_cluster(instance.fleet) - and is_fleet_master_instance(instance) + and instance.id == master_instance.id and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT and isinstance(compute, ComputeWithPlacementGroupSupport) and ( @@ -667,7 +684,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No "instance_status": InstanceStatus.PROVISIONING.value, }, ) - if instance.fleet_id and 
is_fleet_master_instance(instance): + if instance.fleet_id and instance.id == master_instance.id: # Clean up placement groups that did not end up being used. # Flush to update still uncommitted placement groups. await session.flush() @@ -685,7 +702,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No InstanceTerminationReason.NO_OFFERS, "All offers failed" if offers else "No offers found", ) - if instance.fleet and is_fleet_master_instance(instance) and is_cloud_cluster(instance.fleet): + if instance.fleet and instance.id == master_instance.id and is_cloud_cluster(instance.fleet): # Do not attempt to deploy other instances, as they won't determine the correct cluster # backend, region, and placement group without a successfully deployed master instance for sibling_instance in instance.fleet.instances: @@ -694,6 +711,20 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No _mark_terminated(sibling_instance, InstanceTerminationReason.MASTER_FAILED) +async def _get_fleet_master_instance( + session: AsyncSession, instance: InstanceModel +) -> InstanceModel: + # The "master" fleet instance is relevant for cloud clusters only: + # it can be any fixed instance that is chosen to be provisioned first. 
+ res = await session.execute( + select(InstanceModel) + .where(InstanceModel.fleet_id == instance.fleet_id) + .order_by(InstanceModel.instance_num, InstanceModel.created_at) + .limit(1) + ) + return res.scalar_one() + + def _mark_terminated( instance: InstanceModel, termination_reason: InstanceTerminationReason, @@ -1182,15 +1213,17 @@ def _get_termination_deadline(instance: InstanceModel) -> datetime.datetime: return instance.first_termination_retry_at + TERMINATION_RETRY_MAX_DURATION -def _need_to_wait_fleet_provisioning(instance: InstanceModel) -> bool: +def _need_to_wait_fleet_provisioning( + instance: InstanceModel, master_instance: InstanceModel +) -> bool: # Cluster cloud instances should wait for the first fleet instance to be provisioned # so that they are provisioned in the same backend/region if instance.fleet is None: return False if ( - is_fleet_master_instance(instance) - or instance.fleet.instances[0].job_provisioning_data is not None - or instance.fleet.instances[0].status == InstanceStatus.TERMINATED + instance.id == master_instance.id + or master_instance.job_provisioning_data is not None + or master_instance.status == InstanceStatus.TERMINATED ): return False return is_cloud_cluster(instance.fleet) @@ -1199,13 +1232,13 @@ def _need_to_wait_fleet_provisioning(instance: InstanceModel) -> bool: def _get_instance_offer_for_instance( instance_offer: InstanceOfferWithAvailability, instance: InstanceModel, + master_instance: InstanceModel, ) -> InstanceOfferWithAvailability: if instance.fleet is None: return instance_offer fleet = fleet_model_to_fleet(instance.fleet) - master_instance = instance.fleet.instances[0] - master_job_provisioning_data = get_instance_provisioning_data(master_instance) if fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER: + master_job_provisioning_data = get_instance_provisioning_data(master_instance) return get_instance_offer_with_restricted_az( instance_offer=instance_offer, 
master_job_provisioning_data=master_job_provisioning_data, diff --git a/src/dstack/_internal/server/background/tasks/process_runs.py b/src/dstack/_internal/server/background/tasks/process_runs.py index b4397b95e0..ad42e7ed40 100644 --- a/src/dstack/_internal/server/background/tasks/process_runs.py +++ b/src/dstack/_internal/server/background/tasks/process_runs.py @@ -4,7 +4,7 @@ from sqlalchemy import and_, func, or_, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only +from sqlalchemy.orm import aliased, contains_eager, joinedload, load_only, with_loader_criteria import dstack._internal.server.services.services.autoscalers as autoscalers from dstack._internal.core.errors import ServerError @@ -111,7 +111,15 @@ async def _process_next_run(): ), ), ) - .options(joinedload(RunModel.jobs).load_only(JobModel.id)) + .options( + joinedload(RunModel.jobs).load_only(JobModel.id), + # No need to lock finished jobs + with_loader_criteria( + JobModel, + JobModel.status.not_in(JobStatus.finished_statuses()), + include_aliases=True, + ), + ) .options(load_only(RunModel.id)) .order_by(RunModel.last_processed_at.asc()) .limit(1) @@ -126,12 +134,20 @@ async def _process_next_run(): JobModel.run_id == run_model.id, JobModel.id.not_in(job_lockset), ) + .options( + load_only(JobModel.id), + with_loader_criteria( + JobModel, + JobModel.status.not_in(JobStatus.finished_statuses()), + include_aliases=True, + ), + ) .order_by(JobModel.id) # take locks in order .with_for_update(skip_locked=True, key_share=True) ) job_models = res.scalars().all() if len(run_model.jobs) != len(job_models): - # Some jobs are locked + # Some jobs are locked or there was a non-repeatable read return job_ids = [j.id for j in run_model.jobs] run_lockset.add(run_model.id) diff --git a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py index 
d1d86c41aa..e132f83a49 100644 --- a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py @@ -7,7 +7,14 @@ from sqlalchemy import func, or_, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import contains_eager, joinedload, load_only, noload, selectinload +from sqlalchemy.orm import ( + contains_eager, + joinedload, + load_only, + noload, + selectinload, + with_loader_criteria, +) from dstack._internal.core.backends.base.backend import Backend from dstack._internal.core.backends.base.compute import ( @@ -213,7 +220,12 @@ async def _process_submitted_job( select(JobModel) .where(JobModel.id == job_model.id) .options(joinedload(JobModel.instance)) - .options(joinedload(JobModel.fleet).joinedload(FleetModel.instances)) + .options( + joinedload(JobModel.fleet).joinedload(FleetModel.instances), + with_loader_criteria( + InstanceModel, InstanceModel.deleted == False, include_aliases=True + ), + ) ) job_model = res.unique().scalar_one() res = await session.execute( @@ -221,7 +233,12 @@ async def _process_submitted_job( .where(RunModel.id == job_model.run_id) .options(joinedload(RunModel.project).joinedload(ProjectModel.backends)) .options(joinedload(RunModel.user).load_only(UserModel.name)) - .options(joinedload(RunModel.fleet).joinedload(FleetModel.instances)) + .options( + joinedload(RunModel.fleet).joinedload(FleetModel.instances), + with_loader_criteria( + InstanceModel, InstanceModel.deleted == False, include_aliases=True + ), + ) ) run_model = res.unique().scalar_one() logger.debug("%s: provisioning has started", fmt(job_model)) diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index e347829fa4..95ae519d07 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -728,10 +728,6 @@ def is_cloud_cluster(fleet_model: FleetModel) -> 
bool: ) -def is_fleet_master_instance(instance: InstanceModel) -> bool: - return instance.fleet is not None and instance.id == instance.fleet.instances[0].id - - def get_fleet_requirements(fleet_spec: FleetSpec) -> Requirements: profile = fleet_spec.merged_profile requirements = Requirements( diff --git a/src/dstack/_internal/server/services/placement.py b/src/dstack/_internal/server/services/placement.py index f0c63f891c..d0c045cdc9 100644 --- a/src/dstack/_internal/server/services/placement.py +++ b/src/dstack/_internal/server/services/placement.py @@ -98,9 +98,10 @@ async def schedule_fleet_placement_groups_deletion( def get_placement_group_model_for_instance( placement_group_models: list[PlacementGroupModel], instance_model: InstanceModel, + master_instance_model: InstanceModel, ) -> Optional[PlacementGroupModel]: placement_group_model = None - if not _is_fleet_master_instance(instance_model): + if instance_model.id != master_instance_model.id: if placement_group_models: placement_group_model = placement_group_models[0] if len(placement_group_models) > 1: @@ -231,7 +232,3 @@ async def create_placement_group( ) placement_group_model.provisioning_data = pgpd.json() return placement_group_model - - -def _is_fleet_master_instance(instance: InstanceModel) -> bool: - return instance.fleet is not None and instance.id == instance.fleet.instances[0].id From 71c12ad72052c276bff3e8fc8bd29e43bf8c067e Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Fri, 16 Jan 2026 07:42:29 +0000 Subject: [PATCH 054/187] Kubernetes: adjust offer GPU count (#3469) Fixes: https://github.com/dstackai/dstack/issues/3468 --- .../core/backends/kubernetes/compute.py | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index 4f6379b173..7f8ef9123f 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ 
b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -117,9 +117,12 @@ def __init__(self, config: KubernetesConfig): def get_offers_by_requirements( self, requirements: Requirements ) -> list[InstanceOfferWithAvailability]: + gpu_request = 0 + if (gpu_spec := requirements.resources.gpu) is not None: + gpu_request = _get_gpu_request_from_gpu_spec(gpu_spec) instance_offers: list[InstanceOfferWithAvailability] = [] for node in self.api.list_node().items: - if (instance_offer := _get_instance_offer_from_node(node)) is not None: + if (instance_offer := _get_instance_offer_from_node(node, gpu_request)) is not None: instance_offers.extend( filter_offers_by_requirements([instance_offer], requirements) ) @@ -188,15 +191,15 @@ def run_job( if (cpu_max := resources_spec.cpu.count.max) is not None: resources_limits["cpu"] = str(cpu_max) if (gpu_spec := resources_spec.gpu) is not None: - gpu_min = gpu_spec.count.min - if gpu_min is not None and gpu_min > 0: + if (gpu_request := _get_gpu_request_from_gpu_spec(gpu_spec)) > 0: gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu( self.api, gpu_spec ) - logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_min) + logger.debug("Requesting GPU resource: %s=%d", gpu_resource, gpu_request) + resources_requests[gpu_resource] = str(gpu_request) # Limit must be set (GPU resources cannot be overcommitted) # and must be equal to request. - resources_requests[gpu_resource] = resources_limits[gpu_resource] = str(gpu_min) + resources_limits[gpu_resource] = str(gpu_request) # It should be NoSchedule, but we also add NoExecute toleration just in case. 
for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]: tolerations.append( @@ -335,7 +338,10 @@ def update_provisioning_data( provisioning_data.hostname = get_or_error(service_spec.cluster_ip) pod_spec = get_or_error(pod.spec) node = self.api.read_node(name=get_or_error(pod_spec.node_name)) - if (instance_offer := _get_instance_offer_from_node(node)) is not None: + # The original offer has a list of GPUs already sliced according to pod spec's GPU resource + # request, which is inferred from dstack's GPUSpec, see _get_gpu_request_from_gpu_spec + gpu_request = len(provisioning_data.instance_type.resources.gpus) + if (instance_offer := _get_instance_offer_from_node(node, gpu_request)) is not None: provisioning_data.instance_type = instance_offer.instance provisioning_data.region = instance_offer.region provisioning_data.price = instance_offer.price @@ -475,7 +481,13 @@ def terminate_gateway( ) -def _get_instance_offer_from_node(node: client.V1Node) -> Optional[InstanceOfferWithAvailability]: +def _get_gpu_request_from_gpu_spec(gpu_spec: GPUSpec) -> int: + return gpu_spec.count.min or 0 + + +def _get_instance_offer_from_node( + node: client.V1Node, gpu_request: int +) -> Optional[InstanceOfferWithAvailability]: try: node_name = get_or_error(get_or_error(node.metadata).name) node_status = get_or_error(node.status) @@ -499,7 +511,7 @@ def _get_instance_offer_from_node(node: client.V1Node) -> Optional[InstanceOffer cpus=cpus, cpu_arch=cpu_arch, memory_mib=memory_mib, - gpus=gpus, + gpus=gpus[:gpu_request], spot=False, disk=Disk(size_mib=disk_size_mib), ), From 395ccb75dc3fa43df9150cbf5b47ded584657f2c Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Fri, 16 Jan 2026 09:07:04 +0000 Subject: [PATCH 055/187] Add missing job status change event for scaling (#3465) Emit the job status change event when a job transitions to `terminating` due to scaling. 
This case was previously missed because the `job` variable was not inferred as `JobModel`. --- src/dstack/_internal/server/services/runs/replicas.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dstack/_internal/server/services/runs/replicas.py b/src/dstack/_internal/server/services/runs/replicas.py index 43065d96d9..e994e77ddc 100644 --- a/src/dstack/_internal/server/services/runs/replicas.py +++ b/src/dstack/_internal/server/services/runs/replicas.py @@ -75,8 +75,8 @@ async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replica ) # lists of (importance, is_out_of_date, replica_num, jobs) - active_replicas = [] - inactive_replicas = [] + active_replicas: list[tuple[int, bool, int, list[JobModel]]] = [] + inactive_replicas: list[tuple[int, bool, int, list[JobModel]]] = [] for replica_num, replica_jobs in group_jobs_by_replica_latest(run_model.jobs): statuses = set(job.status for job in replica_jobs) @@ -108,8 +108,8 @@ async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replica for job in replica_jobs: if job.status.is_finished() or job.status == JobStatus.TERMINATING: continue - job.status = JobStatus.TERMINATING job.termination_reason = JobTerminationReason.SCALED_DOWN + switch_job_status(session, job, JobStatus.TERMINATING, events.SystemActor()) # background task will process the job later else: scheduled_replicas = 0 From 104834e26166d557561faec9597c9c444d1c3cad Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Fri, 16 Jan 2026 14:41:15 +0000 Subject: [PATCH 056/187] Fix `find_optimal_fleet_with_offers` log message (#3470) --- src/dstack/_internal/server/services/runs/plan.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/server/services/runs/plan.py b/src/dstack/_internal/server/services/runs/plan.py index a5b20b15b9..dd1ad1b284 100644 --- a/src/dstack/_internal/server/services/runs/plan.py +++ b/src/dstack/_internal/server/services/runs/plan.py 
@@ -266,7 +266,10 @@ async def find_optimal_fleet_with_offers( continue if not _run_can_fit_into_fleet(run_spec, candidate_fleet): - logger.debug("Skipping fleet %s from consideration: run cannot fit into fleet") + logger.debug( + "Skipping fleet %s from consideration: run cannot fit into fleet", + candidate_fleet.name, + ) continue instance_offers = _get_instance_offers_in_fleet( From 628bb8b29721123757f931bc4606185e6c2f8349 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 19 Jan 2026 15:51:14 +0500 Subject: [PATCH 057/187] Fix missing instance lock in delete_fleets (#3471) * Fix missing instance lock in delete_fleets * Handle terminating deleted instances * Fix comment * Fix log message --- .../background/tasks/process_instances.py | 12 ++-- .../_internal/server/services/fleets.py | 59 ++++++++++++------- .../tasks/test_process_instances.py | 28 +++++++++ 3 files changed, 72 insertions(+), 27 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 9a14bdc30d..454d6ee18a 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -11,7 +11,7 @@ from pydantic import ValidationError from sqlalchemy import and_, delete, func, not_, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload, with_loader_criteria +from sqlalchemy.orm import joinedload from dstack._internal import settings from dstack._internal.core.backends.base.compute import ( @@ -218,9 +218,8 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): .options(joinedload(InstanceModel.project).joinedload(ProjectModel.backends)) .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) .options( - joinedload(InstanceModel.fleet).joinedload(FleetModel.instances), - with_loader_criteria( - InstanceModel, 
InstanceModel.deleted == False, include_aliases=True + joinedload(InstanceModel.fleet).joinedload( + FleetModel.instances.and_(InstanceModel.deleted == False) ), ) .execution_options(populate_existing=True) @@ -233,9 +232,8 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): .options(joinedload(InstanceModel.project)) .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) .options( - joinedload(InstanceModel.fleet).joinedload(FleetModel.instances), - with_loader_criteria( - InstanceModel, InstanceModel.deleted == False, include_aliases=True + joinedload(InstanceModel.fleet).joinedload( + FleetModel.instances.and_(InstanceModel.deleted == False) ), ) .execution_options(populate_existing=True) diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index 95ae519d07..588f34698d 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -42,7 +42,12 @@ ) from dstack._internal.core.models.projects import Project from dstack._internal.core.models.resources import ResourcesSpec -from dstack._internal.core.models.runs import JobProvisioningData, Requirements, get_policy_map +from dstack._internal.core.models.runs import ( + JobProvisioningData, + Requirements, + RunStatus, + get_policy_map, +) from dstack._internal.core.models.users import GlobalRole from dstack._internal.core.services import validate_dstack_resource_name from dstack._internal.core.services.diff import ModelDiff, copy_model, diff_models @@ -53,6 +58,7 @@ JobModel, MemberModel, ProjectModel, + RunModel, UserModel, ) from dstack._internal.server.services import events @@ -613,48 +619,61 @@ async def delete_fleets( instance_nums: Optional[List[int]] = None, ): res = await session.execute( - select(FleetModel) + select(FleetModel.id) .where( FleetModel.project_id == project.id, FleetModel.name.in_(names), FleetModel.deleted == False, ) - 
.options(joinedload(FleetModel.instances)) + .order_by(FleetModel.id) # take locks in order + .with_for_update(key_share=True) ) - fleet_models = res.scalars().unique().all() - fleets_ids = sorted([f.id for f in fleet_models]) - instances_ids = sorted([i.id for f in fleet_models for i in f.instances]) - await session.commit() - logger.info("Deleting fleets: %s", [v.name for v in fleet_models]) + fleets_ids = list(res.scalars().unique().all()) + res = await session.execute( + select(InstanceModel.id) + .where( + InstanceModel.fleet_id.in_(fleets_ids), + InstanceModel.deleted == False, + ) + .order_by(InstanceModel.id) # take locks in order + .with_for_update(key_share=True) + ) + instances_ids = list(res.scalars().unique().all()) + if is_db_sqlite(): + # Start new transaction to see committed changes after lock + await session.commit() async with ( get_locker(get_db().dialect_name).lock_ctx(FleetModel.__tablename__, fleets_ids), get_locker(get_db().dialect_name).lock_ctx(InstanceModel.__tablename__, instances_ids), ): - # Refetch after lock - # TODO: Lock instances with FOR UPDATE? - # TODO: Do not lock fleet when deleting only instances + # Refetch after lock. + # TODO: Do not lock fleet when deleting only instances. 
res = await session.execute( select(FleetModel) - .where( - FleetModel.project_id == project.id, - FleetModel.name.in_(names), - FleetModel.deleted == False, - ) + .where(FleetModel.id.in_(fleets_ids)) .options( - selectinload(FleetModel.instances) + joinedload(FleetModel.instances.and_(InstanceModel.id.in_(instances_ids))) .joinedload(InstanceModel.jobs) .load_only(JobModel.id) ) - .options(selectinload(FleetModel.runs)) + .options( + joinedload( + FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) + ) + ) .execution_options(populate_existing=True) - .order_by(FleetModel.id) # take locks in order - .with_for_update(key_share=True) ) fleet_models = res.scalars().unique().all() fleets = [fleet_model_to_fleet(m) for m in fleet_models] for fleet in fleets: if fleet.spec.configuration.ssh_config is not None: _check_can_manage_ssh_fleets(user=user, project=project) + if instance_nums is None: + logger.info("Deleting fleets: %s", [f.name for f in fleet_models]) + else: + logger.info( + "Deleting fleets %s instances %s", [f.name for f in fleet_models], instance_nums + ) for fleet_model in fleet_models: _terminate_fleet_instances(fleet_model=fleet_model, instance_nums=instance_nums) # TERMINATING fleets are deleted by process_fleets after instances are terminated diff --git a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/tasks/test_process_instances.py index a72dc0c165..38bffc4421 100644 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ b/src/tests/_internal/server/background/tasks/test_process_instances.py @@ -597,6 +597,34 @@ async def test_terminate(self, test_db, session: AsyncSession): assert instance.deleted_at is not None assert instance.finished_at is not None + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_terminates_terminating_deleted_instance(self, test_db, session: 
AsyncSession): + # There was a race condition when instance could stay in Terminating while marked as deleted. + # TODO: Drop this after all such "bad" instances are processed. + project = await create_project(session=session) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.TERMINATING + ) + instance.deleted = True + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + instance.last_job_processed_at = instance.deleted_at = ( + get_current_datetime() + dt.timedelta(minutes=-19) + ) + await session.commit() + + with self.mock_terminate_in_backend() as mock: + await process_instances() + mock.assert_called_once() + + await session.refresh(instance) + + assert instance is not None + assert instance.status == InstanceStatus.TERMINATED + assert instance.deleted == True + assert instance.deleted_at is not None + assert instance.finished_at is not None + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @pytest.mark.parametrize( From a07ef352779fad5aa2fdc0334314becc940ddd47 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 19 Jan 2026 17:16:21 +0500 Subject: [PATCH 058/187] Optimize list and get fleets (#3472) * Do not include deleted instances when listing fleets * Do not include deleted instances when getting fleet * Optimize select in generate_volume_name --- src/dstack/_internal/server/routers/fleets.py | 2 ++ .../_internal/server/services/fleets.py | 27 ++++++++++++------- .../_internal/server/services/volumes.py | 9 +++++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/dstack/_internal/server/routers/fleets.py b/src/dstack/_internal/server/routers/fleets.py index d423134675..a436d1123a 100644 --- a/src/dstack/_internal/server/routers/fleets.py +++ b/src/dstack/_internal/server/routers/fleets.py @@ -47,6 +47,7 @@ async def list_fleets( """ Returns all fleets and instances within them visible to user sorted by descending 
`created_at`. `project_name` and `only_active` can be specified as filters. + Includes only active fleet instances. To list all fleet instances, use `/api/instances/list`. The results are paginated. To get the next page, pass `created_at` and `id` of the last fleet from the previous page as `prev_created_at` and `prev_id`. @@ -72,6 +73,7 @@ async def list_project_fleets( ): """ Returns all fleets in the project. + Includes only active fleet instances. To list all fleet instances, use `/api/instances/list`. """ _, project = user_project return CustomORJSONResponse( diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index 588f34698d..19e4a77e64 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -180,9 +180,7 @@ async def list_fleets( limit=limit, ascending=ascending, ) - return [ - fleet_model_to_fleet(v, include_deleted_instances=not only_active) for v in fleet_models - ] + return [fleet_model_to_fleet(v) for v in fleet_models] async def list_projects_fleet_models( @@ -227,7 +225,7 @@ async def list_projects_fleet_models( .where(*filters) .order_by(*order_by) .limit(limit) - .options(joinedload(FleetModel.instances)) + .options(joinedload(FleetModel.instances.and_(InstanceModel.deleted == False))) ) fleet_models = list(res.unique().scalars().all()) return fleet_models @@ -256,7 +254,9 @@ async def list_project_fleet_models( if not include_deleted: filters.append(FleetModel.deleted == False) res = await session.execute( - select(FleetModel).where(*filters).options(joinedload(FleetModel.instances)) + select(FleetModel) + .where(*filters) + .options(joinedload(FleetModel.instances.and_(InstanceModel.deleted == False))) ) return list(res.unique().scalars().all()) @@ -293,7 +293,9 @@ async def get_project_fleet_model_by_id( FleetModel.project_id == project.id, ] res = await session.execute( - 
select(FleetModel).where(*filters).options(joinedload(FleetModel.instances)) + select(FleetModel) + .where(*filters) + .options(joinedload(FleetModel.instances.and_(InstanceModel.deleted == False))) ) return res.unique().scalar_one_or_none() @@ -311,7 +313,9 @@ async def get_project_fleet_model_by_name( if not include_deleted: filters.append(FleetModel.deleted == False) res = await session.execute( - select(FleetModel).where(*filters).options(joinedload(FleetModel.instances)) + select(FleetModel) + .where(*filters) + .options(joinedload(FleetModel.instances.and_(InstanceModel.deleted == False))) ) return res.unique().scalar_one_or_none() @@ -717,8 +721,13 @@ def get_fleet_spec(fleet_model: FleetModel) -> FleetSpec: async def generate_fleet_name(session: AsyncSession, project: ProjectModel) -> str: - fleet_models = await list_project_fleet_models(session=session, project=project) - names = {v.name for v in fleet_models} + res = await session.execute( + select(FleetModel.name).where( + FleetModel.project_id == project.id, + FleetModel.deleted == False, + ) + ) + names = set(res.scalars().all()) while True: name = random_names.generate_name() if name not in names: diff --git a/src/dstack/_internal/server/services/volumes.py b/src/dstack/_internal/server/services/volumes.py index fa3471192d..eb8f4bab64 100644 --- a/src/dstack/_internal/server/services/volumes.py +++ b/src/dstack/_internal/server/services/volumes.py @@ -380,8 +380,13 @@ def instance_model_to_volume_instance(instance_model: InstanceModel) -> VolumeIn async def generate_volume_name(session: AsyncSession, project: ProjectModel) -> str: - volume_models = await list_project_volume_models(session=session, project=project) - names = {v.name for v in volume_models} + res = await session.execute( + select(VolumeModel.name).where( + VolumeModel.project_id == project.id, + VolumeModel.deleted == False, + ) + ) + names = set(res.scalars().all()) while True: name = random_names.generate_name() if name not in names: 
From 811643f5fa72916bef49faec9cb4e0ca61968863 Mon Sep 17 00:00:00 2001 From: Alexander <4584443+DragonStuff@users.noreply.github.com> Date: Tue, 20 Jan 2026 16:09:44 +0900 Subject: [PATCH 059/187] feat(logging): add fluent-bit log shipping (#3431) * feat(logging): add fluent-bit log shipping Implements #3430. This PR is partially implemented using Cursor. * Fix pyright errors by using try/except/else pattern for optional imports * refactor(fluentbit): cleanup protocol lambdas and address codex comments * feat(fluentbit): validate next_token format and raise ServerClientError for malformed tokens * chore(fluentbit): address quick comments * feat(fluentbit): add tag prefix support to HTTPFluentBitWriter --- docs/docs/guides/server-deployment.md | 80 ++- docs/docs/reference/environment-variables.md | 7 + pyproject.toml | 7 +- .../server/services/logs/__init__.py | 24 + .../server/services/logs/fluentbit.py | 338 +++++++++ src/dstack/_internal/server/settings.py | 9 + .../server/services/test_fluentbit_logs.py | 659 ++++++++++++++++++ 7 files changed, 1120 insertions(+), 4 deletions(-) create mode 100644 src/dstack/_internal/server/services/logs/fluentbit.py create mode 100644 src/tests/_internal/server/services/test_fluentbit_logs.py diff --git a/docs/docs/guides/server-deployment.md b/docs/docs/guides/server-deployment.md index f1d7546d77..dc5093f2f2 100644 --- a/docs/docs/guides/server-deployment.md +++ b/docs/docs/guides/server-deployment.md @@ -159,7 +159,7 @@ $ DSTACK_DATABASE_URL=postgresql+asyncpg://user:password@db-host:5432/dstack dst By default, `dstack` stores workload logs locally in `~/.dstack/server/projects//logs`. For multi-replica server deployments, it's required to store logs externally. -`dstack` supports storing logs using AWS CloudWatch or GCP Logging. +`dstack` supports storing logs using AWS CloudWatch, GCP Logging, or Fluent-bit with Elasticsearch / OpenSearch.
### AWS CloudWatch @@ -222,6 +222,78 @@ To store logs using GCP Logging, set the `DSTACK_SERVER_GCP_LOGGING_PROJECT` env +### Fluent-bit + +To store logs using Fluent-bit, set the `DSTACK_SERVER_FLUENTBIT_HOST` environment variable. +Fluent-bit supports two modes depending on how you want to access logs. + +=== "Full mode" + + Logs are shipped to Fluent-bit and can be read back through the dstack UI and CLI via Elasticsearch or OpenSearch. + Use this mode when you want a complete integration with log viewing in dstack: + + ```shell + $ DSTACK_SERVER_FLUENTBIT_HOST=fluentbit.example.com \ + DSTACK_SERVER_ELASTICSEARCH_HOST=https://elasticsearch.example.com:9200 \ + dstack server + ``` + +=== "Ship-only mode" + + Logs are forwarded to Fluent-bit but cannot be read through `dstack`. + The dstack UI/CLI will show empty logs. Use this mode when: + + - You have an existing logging infrastructure (Kibana, Grafana, Datadog, etc.) + - You only need to forward logs without reading them back through dstack + - You want to reduce operational complexity by not running Elasticsearch/OpenSearch + + ```shell + $ DSTACK_SERVER_FLUENTBIT_HOST=fluentbit.example.com \ + dstack server + ``` + +??? info "Additional configuration" + The following optional environment variables can be used to customize the Fluent-bit integration: + + **Fluent-bit settings:** + + - `DSTACK_SERVER_FLUENTBIT_PORT` – The Fluent-bit port. Defaults to `24224`. + - `DSTACK_SERVER_FLUENTBIT_PROTOCOL` – The protocol to use: `forward` or `http`. Defaults to `forward`. + - `DSTACK_SERVER_FLUENTBIT_TAG_PREFIX` – The tag prefix for logs. Defaults to `dstack`. + + **Elasticsearch/OpenSearch settings (for full mode only):** + + - `DSTACK_SERVER_ELASTICSEARCH_HOST` – The Elasticsearch/OpenSearch host for reading logs. If not set, runs in ship-only mode. + - `DSTACK_SERVER_ELASTICSEARCH_INDEX` – The Elasticsearch/OpenSearch index pattern. Defaults to `dstack-logs`. 
+ - `DSTACK_SERVER_ELASTICSEARCH_API_KEY` – The Elasticsearch/OpenSearch API key for authentication. + +??? info "Fluent-bit configuration" + Configure Fluent-bit to receive logs and forward them to Elasticsearch or OpenSearch. Example configuration: + + ```ini + [INPUT] + Name forward + Listen 0.0.0.0 + Port 24224 + + [OUTPUT] + Name es + Match dstack.* + Host elasticsearch.example.com + Port 9200 + Index dstack-logs + Suppress_Type_Name On + ``` + +??? info "Required dependencies" + To use Fluent-bit log storage, install the `fluentbit` extras: + + ```shell + $ pip install "dstack[all]" -U + # or + $ pip install "dstack[fluentbit]" -U + ``` + ## File storage When using [files](../concepts/dev-environments.md#files) or [repos](../concepts/dev-environments.md#repos), `dstack` uploads local files and diffs to the server so that you can have access to them within runs. By default, the files are stored in the DB and each upload is limited to 2MB. You can configure an object storage to be used for uploads and increase the default limit by setting the `DSTACK_SERVER_CODE_UPLOAD_LIMIT` environment variable @@ -426,8 +498,10 @@ If a deployment is stuck due to a deadlock when applying DB migrations, try scal ??? info "Can I run multiple replicas of dstack server?" - Yes, you can if you configure `dstack` to use [PostgreSQL](#postgresql) and [AWS CloudWatch](#aws-cloudwatch). + Yes, you can if you configure `dstack` to use [PostgreSQL](#postgresql) and an external log storage + such as [AWS CloudWatch](#aws-cloudwatch), [GCP Logging](#gcp-logging), or [Fluent-bit](#fluent-bit). ??? info "Does dstack server support blue-green or rolling deployments?" - Yes, it does if you configure `dstack` to use [PostgreSQL](#postgresql) and [AWS CloudWatch](#aws-cloudwatch). + Yes, it does if you configure `dstack` to use [PostgreSQL](#postgresql) and an external log storage + such as [AWS CloudWatch](#aws-cloudwatch), [GCP Logging](#gcp-logging), or [Fluent-bit](#fluent-bit). 
diff --git a/docs/docs/reference/environment-variables.md b/docs/docs/reference/environment-variables.md index 4575f1b8f8..62ce97cd12 100644 --- a/docs/docs/reference/environment-variables.md +++ b/docs/docs/reference/environment-variables.md @@ -113,6 +113,13 @@ For more details on the options below, refer to the [server deployment](../guide - `DSTACK_SERVER_CLOUDWATCH_LOG_GROUP`{ #DSTACK_SERVER_CLOUDWATCH_LOG_GROUP } – The CloudWatch Logs group for storing workloads logs. If not set, the default file-based log storage is used. - `DSTACK_SERVER_CLOUDWATCH_LOG_REGION`{ #DSTACK_SERVER_CLOUDWATCH_LOG_REGION } – The CloudWatch Logs region. Defaults to `None`. - `DSTACK_SERVER_GCP_LOGGING_PROJECT`{ #DSTACK_SERVER_GCP_LOGGING_PROJECT } – The GCP Logging project for storing workloads logs. If not set, the default file-based log storage is used. +- `DSTACK_SERVER_FLUENTBIT_HOST`{ #DSTACK_SERVER_FLUENTBIT_HOST } – The Fluent-bit host for log forwarding. If set, enables Fluent-bit log storage. +- `DSTACK_SERVER_FLUENTBIT_PORT`{ #DSTACK_SERVER_FLUENTBIT_PORT } – The Fluent-bit port. Defaults to `24224`. +- `DSTACK_SERVER_FLUENTBIT_PROTOCOL`{ #DSTACK_SERVER_FLUENTBIT_PROTOCOL } – The protocol to use: `forward` or `http`. Defaults to `forward`. +- `DSTACK_SERVER_FLUENTBIT_TAG_PREFIX`{ #DSTACK_SERVER_FLUENTBIT_TAG_PREFIX } – The tag prefix for logs. Defaults to `dstack`. +- `DSTACK_SERVER_ELASTICSEARCH_HOST`{ #DSTACK_SERVER_ELASTICSEARCH_HOST } – The Elasticsearch/OpenSearch host for reading logs back through dstack. Optional; if not set, Fluent-bit runs in ship-only mode (logs are forwarded but not readable through dstack UI/CLI). +- `DSTACK_SERVER_ELASTICSEARCH_INDEX`{ #DSTACK_SERVER_ELASTICSEARCH_INDEX } – The Elasticsearch/OpenSearch index pattern. Defaults to `dstack-logs`. +- `DSTACK_SERVER_ELASTICSEARCH_API_KEY`{ #DSTACK_SERVER_ELASTICSEARCH_API_KEY } – The Elasticsearch/OpenSearch API key for authentication. 
- `DSTACK_ENABLE_PROMETHEUS_METRICS`{ #DSTACK_ENABLE_PROMETHEUS_METRICS } — Enables Prometheus metrics collection and export. - `DSTACK_DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE`{ #DSTACK_DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE } – Request body size limit for services running with a gateway, in bytes. Defaults to 64 MiB. - `DSTACK_SERVICE_CLIENT_TIMEOUT`{ #DSTACK_SERVICE_CLIENT_TIMEOUT } – Timeout in seconds for HTTP requests sent from the in-server proxy and gateways to service replicas. Defaults to 60. diff --git a/pyproject.toml b/pyproject.toml index a8d635e7c9..2fe97f2cbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -215,6 +215,11 @@ nebius = [ "nebius>=0.3.4,<0.4; python_version >= '3.10'", "dstack[server]", ] +fluentbit = [ + "fluent-logger>=0.10.0", + "elasticsearch>=8.0.0", + "dstack[server]", +] all = [ - "dstack[gateway,server,aws,azure,gcp,verda,kubernetes,lambda,nebius,oci]", + "dstack[gateway,server,aws,azure,gcp,verda,kubernetes,lambda,nebius,oci,fluentbit]", ] diff --git a/src/dstack/_internal/server/services/logs/__init__.py b/src/dstack/_internal/server/services/logs/__init__.py index 5b06ff4ad2..1f8565d49c 100644 --- a/src/dstack/_internal/server/services/logs/__init__.py +++ b/src/dstack/_internal/server/services/logs/__init__.py @@ -8,6 +8,7 @@ from dstack._internal.server.schemas.logs import PollLogsRequest from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent from dstack._internal.server.services.logs import aws as aws_logs +from dstack._internal.server.services.logs import fluentbit as fluentbit_logs from dstack._internal.server.services.logs import gcp as gcp_logs from dstack._internal.server.services.logs.base import ( LogStorage, @@ -57,6 +58,29 @@ def get_log_storage() -> LogStorage: logger.debug("Using GCP Logs storage") else: logger.error("Cannot use GCP Logs storage: GCP deps are not installed") + elif settings.SERVER_FLUENTBIT_HOST: + if fluentbit_logs.FLUENTBIT_AVAILABLE: + try: + _log_storage = 
fluentbit_logs.FluentBitLogStorage( + host=settings.SERVER_FLUENTBIT_HOST, + port=settings.SERVER_FLUENTBIT_PORT, + protocol=settings.SERVER_FLUENTBIT_PROTOCOL, + tag_prefix=settings.SERVER_FLUENTBIT_TAG_PREFIX, + es_host=settings.SERVER_ELASTICSEARCH_HOST, + es_index=settings.SERVER_ELASTICSEARCH_INDEX, + es_api_key=settings.SERVER_ELASTICSEARCH_API_KEY, + ) + except LogStorageError as e: + logger.error("Failed to initialize Fluent-bit Logs storage: %s", e) + except Exception: + logger.exception("Got exception when initializing Fluent-bit Logs storage") + else: + if settings.SERVER_ELASTICSEARCH_HOST: + logger.debug("Using Fluent-bit Logs storage with Elasticsearch/OpenSearch") + else: + logger.debug("Using Fluent-bit Logs storage in ship-only mode") + else: + logger.error("Cannot use Fluent-bit Logs storage: fluent-logger is not installed") if _log_storage is None: _log_storage = FileLogStorage() logger.debug("Using file-based storage") diff --git a/src/dstack/_internal/server/services/logs/fluentbit.py b/src/dstack/_internal/server/services/logs/fluentbit.py new file mode 100644 index 0000000000..b45b2988d5 --- /dev/null +++ b/src/dstack/_internal/server/services/logs/fluentbit.py @@ -0,0 +1,338 @@ +from typing import List, Optional, Protocol +from uuid import UUID + +import httpx + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.logs import ( + JobSubmissionLogs, + LogEvent, + LogEventSource, + LogProducer, +) +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent +from dstack._internal.server.services.logs.base import ( + LogStorage, + LogStorageError, + unix_time_ms_to_datetime, +) +from dstack._internal.utils.common import batched +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +ELASTICSEARCH_AVAILABLE = True +try: + from 
elasticsearch import Elasticsearch + from elasticsearch.exceptions import ApiError, TransportError +except ImportError: + ELASTICSEARCH_AVAILABLE = False +else: + ElasticsearchError: tuple = (ApiError, TransportError) # type: ignore[misc] + + class ElasticsearchReader: + """Reads logs from Elasticsearch or OpenSearch.""" + + def __init__( + self, + host: str, + index: str, + api_key: Optional[str] = None, + ) -> None: + if api_key: + self._client = Elasticsearch(hosts=[host], api_key=api_key) + else: + self._client = Elasticsearch(hosts=[host]) + self._index = index + # Verify connection + try: + self._client.info() + except ElasticsearchError as e: + raise LogStorageError(f"Failed to connect to Elasticsearch/OpenSearch: {e}") from e + + def read( + self, + stream_name: str, + request: PollLogsRequest, + ) -> JobSubmissionLogs: + sort_order = "desc" if request.descending else "asc" + + query: dict = { + "bool": { + "must": [ + {"term": {"stream.keyword": stream_name}}, + ] + } + } + + if request.start_time: + query["bool"].setdefault("filter", []).append( + {"range": {"@timestamp": {"gt": request.start_time.isoformat()}}} + ) + if request.end_time: + query["bool"].setdefault("filter", []).append( + {"range": {"@timestamp": {"lt": request.end_time.isoformat()}}} + ) + + search_params: dict = { + "index": self._index, + "query": query, + "sort": [ + {"@timestamp": {"order": sort_order}}, + {"_id": {"order": sort_order}}, + ], + "size": request.limit, + } + + if request.next_token: + parts = request.next_token.split(":", 1) + if len(parts) != 2 or not parts[0] or not parts[1]: + raise ServerClientError( + f"Invalid next_token: {request.next_token}. " + "Must be in format 'timestamp:document_id'." 
+ ) + search_params["search_after"] = [parts[0], parts[1]] + + try: + response = self._client.search(**search_params) + except ElasticsearchError as e: + logger.error("Elasticsearch/OpenSearch search error: %s", e) + raise LogStorageError(f"Elasticsearch/OpenSearch error: {e}") from e + + hits = response.get("hits", {}).get("hits", []) + logs = [] + last_sort_values = None + + for hit in hits: + source = hit.get("_source", {}) + timestamp_str = source.get("@timestamp") + message = source.get("message", "") + + if timestamp_str: + from datetime import datetime + + try: + timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00")) + except ValueError: + continue + else: + continue + + logs.append( + LogEvent( + timestamp=timestamp, + log_source=LogEventSource.STDOUT, + message=message, + ) + ) + + sort_values = hit.get("sort") + if sort_values and len(sort_values) >= 2: + last_sort_values = sort_values + + next_token = None + if len(logs) == request.limit and last_sort_values is not None: + next_token = f"{last_sort_values[0]}:{last_sort_values[1]}" + + return JobSubmissionLogs( + logs=logs, + next_token=next_token, + ) + + def close(self) -> None: + self._client.close() + + +FLUENTBIT_AVAILABLE = True +try: + from fluent import sender as fluent_sender +except ImportError: + FLUENTBIT_AVAILABLE = False +else: + + class FluentBitWriter(Protocol): + def write(self, tag: str, records: List[dict]) -> None: ... + def close(self) -> None: ... + + class LogReader(Protocol): + def read(self, stream_name: str, request: PollLogsRequest) -> JobSubmissionLogs: ... + def close(self) -> None: ... 
+ + class HTTPFluentBitWriter: + """Writes logs to Fluent-bit via HTTP POST.""" + + def __init__(self, host: str, port: int, tag_prefix: str) -> None: + self._endpoint = f"http://{host}:{port}" + self._client = httpx.Client(timeout=30.0) + self._tag_prefix = tag_prefix + + def write(self, tag: str, records: List[dict]) -> None: + prefixed_tag = f"{self._tag_prefix}.{tag}" if self._tag_prefix else tag + for record in records: + try: + response = self._client.post( + f"{self._endpoint}/{prefixed_tag}", + json=record, + headers={"Content-Type": "application/json"}, + ) + response.raise_for_status() + except httpx.HTTPStatusError as e: + logger.error( + "Fluent-bit HTTP request failed with status %d: %s", + e.response.status_code, + e.response.text, + ) + raise LogStorageError( + f"Fluent-bit HTTP error: status {e.response.status_code}" + ) from e + except httpx.HTTPError as e: + logger.error("Failed to write log to Fluent-bit via HTTP: %s", e) + raise LogStorageError(f"Fluent-bit HTTP error: {e}") from e + + def close(self) -> None: + self._client.close() + + class ForwardFluentBitWriter: + """Writes logs to Fluent-bit using Forward protocol.""" + + def __init__(self, host: str, port: int, tag_prefix: str) -> None: + self._sender = fluent_sender.FluentSender(tag_prefix, host=host, port=port) + self._tag_prefix = tag_prefix + + def write(self, tag: str, records: List[dict]) -> None: + for record in records: + if not self._sender.emit(tag, record): + error = self._sender.last_error + logger.error("Failed to write log to Fluent-bit via Forward: %s", error) + self._sender.clear_last_error() + raise LogStorageError(f"Fluent-bit Forward error: {error}") + + def close(self) -> None: + self._sender.close() + + class NullLogReader: + """ + Null reader for ship-only mode (no Elasticsearch/OpenSearch configured). + + Returns empty logs. Useful when logs are shipped to an external system + that is accessed directly rather than through dstack. 
+ """ + + def read(self, stream_name: str, request: PollLogsRequest) -> JobSubmissionLogs: + return JobSubmissionLogs(logs=[], next_token=None) + + def close(self) -> None: + pass + + class FluentBitLogStorage(LogStorage): + """ + Log storage using Fluent-bit for writing and optionally Elasticsearch/OpenSearch for reading. + + Supports two modes: + - Full mode: Writes to Fluent-bit and reads from Elasticsearch/OpenSearch + - Ship-only mode: Writes to Fluent-bit only (no reading, returns empty logs) + """ + + MAX_BATCH_SIZE = 100 + + def __init__( + self, + host: str, + port: int, + protocol: str, + tag_prefix: str, + es_host: Optional[str] = None, + es_index: str = "dstack-logs", + es_api_key: Optional[str] = None, + ) -> None: + self._tag_prefix = tag_prefix + + if protocol == "http": + self._writer: FluentBitWriter = HTTPFluentBitWriter( + host=host, port=port, tag_prefix=tag_prefix + ) + elif protocol == "forward": + self._writer = ForwardFluentBitWriter(host=host, port=port, tag_prefix=tag_prefix) + else: + raise LogStorageError(f"Unsupported Fluent-bit protocol: {protocol}") + + self._reader: LogReader + if es_host: + if not ELASTICSEARCH_AVAILABLE: + raise LogStorageError( + "Elasticsearch/OpenSearch host configured but elasticsearch package " + "is not installed. 
Install with: pip install elasticsearch" + ) + self._reader = ElasticsearchReader( + host=es_host, + index=es_index, + api_key=es_api_key, + ) + logger.debug( + "Fluent-bit log storage initialized with Elasticsearch/OpenSearch reader" + ) + else: + self._reader = NullLogReader() + logger.info( + "Fluent-bit log storage initialized in ship-only mode " + "(no Elasticsearch/OpenSearch configured for reading)" + ) + + def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs: + producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB + stream_name = self._get_stream_name( + project_name=project.name, + run_name=request.run_name, + job_submission_id=request.job_submission_id, + producer=producer, + ) + return self._reader.read(stream_name=stream_name, request=request) + + def write_logs( + self, + project: ProjectModel, + run_name: str, + job_submission_id: UUID, + runner_logs: List[RunnerLogEvent], + job_logs: List[RunnerLogEvent], + ) -> None: + producers_with_logs = [(LogProducer.RUNNER, runner_logs), (LogProducer.JOB, job_logs)] + for producer, producer_logs in producers_with_logs: + if not producer_logs: + continue + stream_name = self._get_stream_name( + project_name=project.name, + run_name=run_name, + job_submission_id=job_submission_id, + producer=producer, + ) + self._write_logs_to_stream(stream_name=stream_name, logs=producer_logs) + + def _write_logs_to_stream(self, stream_name: str, logs: List[RunnerLogEvent]) -> None: + for batch in batched(logs, self.MAX_BATCH_SIZE): + records = [] + for log in batch: + message = log.message.decode(errors="replace") + timestamp = unix_time_ms_to_datetime(log.timestamp) + records.append( + { + "message": message, + "@timestamp": timestamp.isoformat(), + "stream": stream_name, + } + ) + self._writer.write(tag=stream_name, records=records) + + def close(self) -> None: + try: + self._writer.close() + finally: + self._reader.close() + + def _get_stream_name( + self, project_name: 
str, run_name: str, job_submission_id: UUID, producer: LogProducer + ) -> str: + return f"{project_name}/{run_name}/{job_submission_id}/{producer.value}" diff --git a/src/dstack/_internal/server/settings.py b/src/dstack/_internal/server/settings.py index 74d1d7b8d5..6e5c8e4bc1 100644 --- a/src/dstack/_internal/server/settings.py +++ b/src/dstack/_internal/server/settings.py @@ -78,6 +78,15 @@ SERVER_GCP_LOGGING_PROJECT = os.getenv("DSTACK_SERVER_GCP_LOGGING_PROJECT") +SERVER_FLUENTBIT_HOST = os.getenv("DSTACK_SERVER_FLUENTBIT_HOST") +SERVER_FLUENTBIT_PORT = int(os.getenv("DSTACK_SERVER_FLUENTBIT_PORT", "24224")) +SERVER_FLUENTBIT_PROTOCOL = os.getenv("DSTACK_SERVER_FLUENTBIT_PROTOCOL", "forward") +SERVER_FLUENTBIT_TAG_PREFIX = os.getenv("DSTACK_SERVER_FLUENTBIT_TAG_PREFIX", "dstack") + +SERVER_ELASTICSEARCH_HOST = os.getenv("DSTACK_SERVER_ELASTICSEARCH_HOST") +SERVER_ELASTICSEARCH_INDEX = os.getenv("DSTACK_SERVER_ELASTICSEARCH_INDEX", "dstack-logs") +SERVER_ELASTICSEARCH_API_KEY = os.getenv("DSTACK_SERVER_ELASTICSEARCH_API_KEY") + SERVER_METRICS_RUNNING_TTL_SECONDS = environ.get_int( "DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS", default=3600 ) diff --git a/src/tests/_internal/server/services/test_fluentbit_logs.py b/src/tests/_internal/server/services/test_fluentbit_logs.py new file mode 100644 index 0000000000..937838e016 --- /dev/null +++ b/src/tests/_internal/server/services/test_fluentbit_logs.py @@ -0,0 +1,659 @@ +from datetime import datetime, timezone +from unittest.mock import Mock, patch +from uuid import UUID + +import pytest +import pytest_asyncio +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.server.models import ProjectModel +from dstack._internal.server.schemas.logs import PollLogsRequest +from dstack._internal.server.schemas.runner import LogEvent as RunnerLogEvent +from dstack._internal.server.services.logs.base import LogStorageError +from 
dstack._internal.server.services.logs.fluentbit import ( + ELASTICSEARCH_AVAILABLE, + FLUENTBIT_AVAILABLE, +) +from dstack._internal.server.testing.common import create_project + +pytestmark = pytest.mark.skipif(not FLUENTBIT_AVAILABLE, reason="fluent-logger not installed") + +# Conditionally import classes that are only defined when FLUENTBIT_AVAILABLE is True +if FLUENTBIT_AVAILABLE: + from dstack._internal.server.services.logs.fluentbit import ( + FluentBitLogStorage, + ForwardFluentBitWriter, + HTTPFluentBitWriter, + NullLogReader, + ) + + if ELASTICSEARCH_AVAILABLE: + from dstack._internal.server.services.logs.fluentbit import ElasticsearchReader + + +class TestNullLogReader: + """Tests for the NullLogReader (ship-only mode).""" + + def test_read_returns_empty_logs(self): + reader = NullLogReader() + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=100, + ) + result = reader.read("test-stream", request) + + assert result.logs == [] + assert result.next_token is None + + def test_close_does_nothing(self): + reader = NullLogReader() + reader.close() # Should not raise + + +class TestHTTPFluentBitWriter: + """Tests for the HTTPFluentBitWriter.""" + + @pytest.fixture + def mock_httpx_client(self): + with patch("dstack._internal.server.services.logs.fluentbit.httpx.Client") as mock: + yield mock.return_value + + def test_init_creates_client(self, mock_httpx_client): + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + assert writer._endpoint == "http://localhost:8080" + assert writer._tag_prefix == "dstack" + + def test_write_posts_records(self, mock_httpx_client): + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + records = [ + {"message": "Hello", "@timestamp": "2023-10-06T10:00:00+00:00"}, + {"message": "World", "@timestamp": "2023-10-06T10:00:01+00:00"}, + ] + writer.write(tag="test-tag", records=records) + + assert 
mock_httpx_client.post.call_count == 2 + mock_httpx_client.post.assert_any_call( + "http://localhost:8080/dstack.test-tag", + json=records[0], + headers={"Content-Type": "application/json"}, + ) + mock_httpx_client.post.assert_any_call( + "http://localhost:8080/dstack.test-tag", + json=records[1], + headers={"Content-Type": "application/json"}, + ) + + def test_write_calls_raise_for_status(self, mock_httpx_client): + """Test that response.raise_for_status() is called to detect non-2xx responses.""" + mock_response = Mock() + mock_httpx_client.post.return_value = mock_response + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + + writer.write(tag="test-tag", records=[{"message": "test"}]) + + mock_response.raise_for_status.assert_called_once() + + def test_write_raises_on_http_status_error(self, mock_httpx_client): + """Test that 4xx/5xx responses are properly detected and raise LogStorageError.""" + import httpx + + mock_response = Mock() + mock_response.status_code = 500 + mock_response.text = "Internal Server Error" + mock_httpx_client.post.return_value = mock_response + mock_response.raise_for_status.side_effect = httpx.HTTPStatusError( + "Server Error", request=Mock(), response=mock_response + ) + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + + with pytest.raises(LogStorageError, match="Fluent-bit HTTP error: status 500"): + writer.write(tag="test-tag", records=[{"message": "test"}]) + + def test_write_raises_on_transport_error(self, mock_httpx_client): + import httpx + + mock_httpx_client.post.side_effect = httpx.HTTPError("Connection failed") + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + + with pytest.raises(LogStorageError, match="Fluent-bit HTTP error"): + writer.write(tag="test-tag", records=[{"message": "test"}]) + + def test_close_closes_client(self, mock_httpx_client): + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + 
writer.close() + mock_httpx_client.close.assert_called_once() + + def test_write_applies_tag_prefix(self, mock_httpx_client): + """Test that tag prefix is applied to tags in HTTP requests.""" + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="dstack") + records = [{"message": "test"}] + writer.write(tag="project/run/job", records=records) + + mock_httpx_client.post.assert_called_once_with( + "http://localhost:8080/dstack.project/run/job", + json=records[0], + headers={"Content-Type": "application/json"}, + ) + + def test_write_with_empty_tag_prefix(self, mock_httpx_client): + """Test that empty tag prefix doesn't break the tag.""" + writer = HTTPFluentBitWriter(host="localhost", port=8080, tag_prefix="") + records = [{"message": "test"}] + writer.write(tag="test-tag", records=records) + + mock_httpx_client.post.assert_called_once_with( + "http://localhost:8080/test-tag", + json=records[0], + headers={"Content-Type": "application/json"}, + ) + + +class TestForwardFluentBitWriter: + """Tests for the ForwardFluentBitWriter.""" + + @pytest.fixture + def mock_fluent_sender(self): + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock_instance = Mock() + mock_instance.emit.return_value = True + mock.return_value = mock_instance + yield mock_instance + + def test_init_creates_sender(self, mock_fluent_sender): + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock.return_value = mock_fluent_sender + ForwardFluentBitWriter(host="localhost", port=24224, tag_prefix="dstack") + mock.assert_called_once_with("dstack", host="localhost", port=24224) + + def test_write_emits_records(self, mock_fluent_sender): + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock.return_value = mock_fluent_sender + writer = ForwardFluentBitWriter(host="localhost", port=24224, tag_prefix="dstack") + + 
records = [ + {"message": "Hello"}, + {"message": "World"}, + ] + writer.write(tag="test-tag", records=records) + + assert mock_fluent_sender.emit.call_count == 2 + + def test_write_raises_on_emit_failure(self, mock_fluent_sender): + mock_fluent_sender.emit.return_value = False + mock_fluent_sender.last_error = Exception("Connection refused") + + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock.return_value = mock_fluent_sender + writer = ForwardFluentBitWriter(host="localhost", port=24224, tag_prefix="dstack") + + with pytest.raises(LogStorageError, match="Fluent-bit Forward error"): + writer.write(tag="test-tag", records=[{"message": "test"}]) + + mock_fluent_sender.clear_last_error.assert_called_once() + + def test_close_closes_sender(self, mock_fluent_sender): + with patch( + "dstack._internal.server.services.logs.fluentbit.fluent_sender.FluentSender" + ) as mock: + mock.return_value = mock_fluent_sender + writer = ForwardFluentBitWriter(host="localhost", port=24224, tag_prefix="dstack") + writer.close() + mock_fluent_sender.close.assert_called_once() + + +class TestFluentBitLogStorage: + """Tests for the FluentBitLogStorage.""" + + @pytest_asyncio.fixture + async def project(self, test_db, session: AsyncSession) -> ProjectModel: + project = await create_project(session=session, name="test-proj") + return project + + @pytest.fixture + def mock_forward_writer(self): + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock_instance = Mock() + mock.return_value = mock_instance + yield mock_instance + + @pytest.fixture + def mock_http_writer(self): + with patch("dstack._internal.server.services.logs.fluentbit.HTTPFluentBitWriter") as mock: + mock_instance = Mock() + mock.return_value = mock_instance + yield mock_instance + + @pytest.fixture + def mock_es_reader(self): + with patch("dstack._internal.server.services.logs.fluentbit.ElasticsearchReader") as 
mock: + mock_instance = Mock() + mock.return_value = mock_instance + yield mock_instance + + def test_init_with_forward_protocol(self, mock_forward_writer): + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + mock.assert_called_once_with(host="localhost", port=24224, tag_prefix="dstack") + assert isinstance(storage._reader, NullLogReader) + + def test_init_with_http_protocol(self, mock_http_writer): + with patch("dstack._internal.server.services.logs.fluentbit.HTTPFluentBitWriter") as mock: + mock.return_value = mock_http_writer + FluentBitLogStorage( + host="localhost", + port=8080, + protocol="http", + tag_prefix="dstack", + ) + mock.assert_called_once_with(host="localhost", port=8080, tag_prefix="dstack") + + def test_init_with_unsupported_protocol_raises(self): + with pytest.raises(LogStorageError, match="Unsupported Fluent-bit protocol"): + FluentBitLogStorage( + host="localhost", + port=24224, + protocol="grpc", + tag_prefix="dstack", + ) + + def test_init_ship_only_mode(self, mock_forward_writer): + """Test initialization without Elasticsearch (ship-only mode).""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + es_host=None, + ) + assert isinstance(storage._reader, NullLogReader) + + @pytest.mark.skipif(not ELASTICSEARCH_AVAILABLE, reason="elasticsearch not installed") + def test_init_with_elasticsearch(self, mock_forward_writer, mock_es_reader): + """Test initialization with Elasticsearch configured.""" + with ( + patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as writer_mock, + patch( + 
"dstack._internal.server.services.logs.fluentbit.ElasticsearchReader" + ) as reader_mock, + ): + writer_mock.return_value = mock_forward_writer + reader_mock.return_value = mock_es_reader + + FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + es_host="http://elasticsearch:9200", + es_index="dstack-logs", + es_api_key="test-key", + ) + reader_mock.assert_called_once_with( + host="http://elasticsearch:9200", + index="dstack-logs", + api_key="test-key", + ) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_write_logs(self, test_db, project: ProjectModel, mock_forward_writer): + """Test writing logs to Fluent-bit.""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + storage.write_logs( + project=project, + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[ + RunnerLogEvent(timestamp=1696586513234, message=b"Runner log"), + ], + job_logs=[ + RunnerLogEvent(timestamp=1696586513235, message=b"Job log"), + ], + ) + + assert mock_forward_writer.write.call_count == 2 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_write_logs_empty_logs_not_written( + self, test_db, project: ProjectModel, mock_forward_writer + ): + """Test that empty log lists are not written.""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + storage.write_logs( + project=project, + run_name="test-run", + 
job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + runner_logs=[], + job_logs=[], + ) + + mock_forward_writer.write.assert_not_called() + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_poll_logs_ship_only_mode(self, test_db, project: ProjectModel): + """Test that ship-only mode returns empty logs.""" + with patch("dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter"): + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=100, + ) + result = storage.poll_logs(project, request) + + assert result.logs == [] + assert result.next_token is None + + def test_close_closes_writer_and_reader(self, mock_forward_writer): + """Test that close() closes both writer and reader.""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + storage.close() + + mock_forward_writer.close.assert_called_once() + + def test_close_closes_reader_even_if_writer_fails(self, mock_forward_writer): + """Test that reader is closed even if writer.close() raises an exception.""" + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock_forward_writer.close.side_effect = Exception("Writer close failed") + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + mock_reader = Mock() + storage._reader = mock_reader + + with pytest.raises(Exception, match="Writer close failed"): + storage.close() + + mock_reader.close.assert_called_once() + + def 
test_get_stream_name(self, mock_forward_writer): + """Test stream name generation.""" + from dstack._internal.core.models.logs import LogProducer + + with patch( + "dstack._internal.server.services.logs.fluentbit.ForwardFluentBitWriter" + ) as mock: + mock.return_value = mock_forward_writer + storage = FluentBitLogStorage( + host="localhost", + port=24224, + protocol="forward", + tag_prefix="dstack", + ) + + stream_name = storage._get_stream_name( + project_name="my-project", + run_name="my-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + producer=LogProducer.JOB, + ) + + assert stream_name == "my-project/my-run/1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e/job" + + +@pytest.mark.skipif( + not FLUENTBIT_AVAILABLE or not ELASTICSEARCH_AVAILABLE, + reason="fluent-logger or elasticsearch not installed", +) +class TestElasticsearchReader: + """Tests for the ElasticsearchReader.""" + + @pytest.fixture + def mock_es_client(self): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock_instance = Mock() + mock_instance.info.return_value = {"version": {"number": "8.0.0"}} + mock_instance.search.return_value = {"hits": {"hits": []}} + mock.return_value = mock_instance + yield mock_instance + + def test_init_verifies_connection(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + mock_es_client.info.assert_called_once() + + def test_init_with_api_key(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + api_key="test-api-key", + ) + mock.assert_called_once_with(hosts=["http://localhost:9200"], api_key="test-api-key") + + def test_init_connection_error_raises(self): + from 
elasticsearch.exceptions import ConnectionError as ESConnectionError + + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock_instance = Mock() + mock_instance.info.side_effect = ESConnectionError("Connection refused") + mock.return_value = mock_instance + + with pytest.raises(LogStorageError, match="Failed to connect"): + ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + def test_read_returns_logs(self, mock_es_client): + mock_es_client.search.return_value = { + "hits": { + "hits": [ + { + "_source": { + "@timestamp": "2023-10-06T10:01:53.234000+00:00", + "message": "Hello", + "stream": "test-stream", + }, + "sort": [1696586513234, "doc1"], + }, + { + "_source": { + "@timestamp": "2023-10-06T10:01:53.235000+00:00", + "message": "World", + "stream": "test-stream", + }, + "sort": [1696586513235, "doc2"], + }, + ] + } + } + + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=2, + ) + result = reader.read("test-stream", request) + + assert len(result.logs) == 2 + assert result.logs[0].message == "Hello" + assert result.logs[1].message == "World" + assert result.next_token == "1696586513235:doc2" + + def test_read_with_time_filtering(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + start_time=datetime(2023, 10, 6, 10, 0, 0, tzinfo=timezone.utc), + end_time=datetime(2023, 10, 6, 11, 0, 0, 
tzinfo=timezone.utc), + limit=100, + ) + reader.read("test-stream", request) + + call_args = mock_es_client.search.call_args + query = call_args.kwargs["query"] + assert "filter" in query["bool"] + assert len(query["bool"]["filter"]) == 2 + + def test_read_descending_order(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + limit=100, + descending=True, + ) + reader.read("test-stream", request) + + call_args = mock_es_client.search.call_args + assert call_args.kwargs["sort"] == [ + {"@timestamp": {"order": "desc"}}, + {"_id": {"order": "desc"}}, + ] + + def test_read_with_next_token(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + next_token="1696586513234:doc1", + limit=100, + ) + reader.read("test-stream", request) + + call_args = mock_es_client.search.call_args + assert call_args.kwargs["search_after"] == ["1696586513234", "doc1"] + + def test_read_with_malformed_next_token_raises_client_error(self, mock_es_client): + """Test that malformed next_token raises ServerClientError (400) instead of IndexError (500).""" + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + + request = PollLogsRequest( + run_name="test-run", + job_submission_id=UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e"), + 
next_token="invalid_token_no_colon", + limit=100, + ) + with pytest.raises(ServerClientError, match="Invalid next_token"): + reader.read("test-stream", request) + + request.next_token = ":" + with pytest.raises(ServerClientError, match="Invalid next_token"): + reader.read("test-stream", request) + + request.next_token = ":doc1" + with pytest.raises(ServerClientError, match="Invalid next_token"): + reader.read("test-stream", request) + + request.next_token = "1696586513234:" + with pytest.raises(ServerClientError, match="Invalid next_token"): + reader.read("test-stream", request) + + mock_es_client.search.assert_not_called() + + def test_close_closes_client(self, mock_es_client): + with patch("dstack._internal.server.services.logs.fluentbit.Elasticsearch") as mock: + mock.return_value = mock_es_client + reader = ElasticsearchReader( + host="http://localhost:9200", + index="dstack-logs", + ) + reader.close() + mock_es_client.close.assert_called_once() From 29076ba178c97096b88ac07159b4f96171931453 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 20 Jan 2026 13:07:22 +0500 Subject: [PATCH 060/187] Adjust fluent-bit logging integration (#3478) * Move import to the top * Fix double error logging * Add missing backticks --- docs/docs/guides/server-deployment.md | 6 +++--- .../_internal/server/services/logs/__init__.py | 11 ++++++++--- .../_internal/server/services/logs/fluentbit.py | 13 ++----------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/docs/docs/guides/server-deployment.md b/docs/docs/guides/server-deployment.md index dc5093f2f2..42365452aa 100644 --- a/docs/docs/guides/server-deployment.md +++ b/docs/docs/guides/server-deployment.md @@ -229,8 +229,8 @@ Fluent-bit supports two modes depending on how you want to access logs. === "Full mode" - Logs are shipped to Fluent-bit and can be read back through the dstack UI and CLI via Elasticsearch or OpenSearch. 
- Use this mode when you want a complete integration with log viewing in dstack: + Logs are shipped to Fluent-bit and can be read back through the `dstack` UI and CLI via Elasticsearch or OpenSearch. + Use this mode when you want a complete integration with log viewing in `dstack`: ```shell $ DSTACK_SERVER_FLUENTBIT_HOST=fluentbit.example.com \ @@ -244,7 +244,7 @@ Fluent-bit supports two modes depending on how you want to access logs. The dstack UI/CLI will show empty logs. Use this mode when: - You have an existing logging infrastructure (Kibana, Grafana, Datadog, etc.) - - You only need to forward logs without reading them back through dstack + - You only need to forward logs without reading them back through `dstack` - You want to reduce operational complexity by not running Elasticsearch/OpenSearch ```shell diff --git a/src/dstack/_internal/server/services/logs/__init__.py b/src/dstack/_internal/server/services/logs/__init__.py index 1f8565d49c..bc601688bc 100644 --- a/src/dstack/_internal/server/services/logs/__init__.py +++ b/src/dstack/_internal/server/services/logs/__init__.py @@ -2,6 +2,7 @@ from typing import List, Optional from uuid import UUID +from dstack._internal.core.errors import ServerClientError from dstack._internal.core.models.logs import JobSubmissionLogs from dstack._internal.server import settings from dstack._internal.server.models import ProjectModel @@ -105,9 +106,13 @@ def write_logs( async def poll_logs_async(project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs: - job_submission_logs = await run_async( - get_log_storage().poll_logs, project=project, request=request - ) + try: + job_submission_logs = await run_async( + get_log_storage().poll_logs, project=project, request=request + ) + except LogStorageError as e: + logger.error("Failed to poll logs from log storage: %s", repr(e)) + raise ServerClientError("Failed to poll logs from log storage") # Logs are stored in plaintext but transmitted in base64 for API/CLI 
backward compatibility. # Old logs stored in base64 are encoded twice for transmission and shown as base64 in CLI/UI. # We live with that. diff --git a/src/dstack/_internal/server/services/logs/fluentbit.py b/src/dstack/_internal/server/services/logs/fluentbit.py index b45b2988d5..bb97e21f09 100644 --- a/src/dstack/_internal/server/services/logs/fluentbit.py +++ b/src/dstack/_internal/server/services/logs/fluentbit.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List, Optional, Protocol from uuid import UUID @@ -99,7 +100,6 @@ def read( try: response = self._client.search(**search_params) except ElasticsearchError as e: - logger.error("Elasticsearch/OpenSearch search error: %s", e) raise LogStorageError(f"Elasticsearch/OpenSearch error: {e}") from e hits = response.get("hits", {}).get("hits", []) @@ -112,8 +112,6 @@ def read( message = source.get("message", "") if timestamp_str: - from datetime import datetime - try: timestamp = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00")) except ValueError: @@ -180,16 +178,10 @@ def write(self, tag: str, records: List[dict]) -> None: ) response.raise_for_status() except httpx.HTTPStatusError as e: - logger.error( - "Fluent-bit HTTP request failed with status %d: %s", - e.response.status_code, - e.response.text, - ) raise LogStorageError( f"Fluent-bit HTTP error: status {e.response.status_code}" ) from e except httpx.HTTPError as e: - logger.error("Failed to write log to Fluent-bit via HTTP: %s", e) raise LogStorageError(f"Fluent-bit HTTP error: {e}") from e def close(self) -> None: @@ -206,7 +198,6 @@ def write(self, tag: str, records: List[dict]) -> None: for record in records: if not self._sender.emit(tag, record): error = self._sender.last_error - logger.error("Failed to write log to Fluent-bit via Forward: %s", error) self._sender.clear_last_error() raise LogStorageError(f"Fluent-bit Forward error: {error}") @@ -271,7 +262,7 @@ def __init__( index=es_index, api_key=es_api_key, ) - 
logger.debug( + logger.info( "Fluent-bit log storage initialized with Elasticsearch/OpenSearch reader" ) else: From 54d2d0aa09a54e980529e9c4464a0d1d14db48d2 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Tue, 20 Jan 2026 10:03:55 +0000 Subject: [PATCH 061/187] Emit events for instance status changes (#3477) - Emit an event on every instance status change - To make events more informative, set termination reasons whenever terminating instances - Add `terminated_by_user` termination reason - Remove redundant logging now covered by events - Refactor runtime-only status changes that were not persisted and did not affect logic - For event readability, only include the busy blocks count in job assigned/unassigned events, which is the only place where the count can change --- src/dstack/_internal/core/models/instances.py | 1 + .../tasks/process_compute_groups.py | 9 +- .../server/background/tasks/process_fleets.py | 11 +- .../background/tasks/process_instances.py | 208 +++++------------- .../tasks/process_submitted_jobs.py | 9 +- src/dstack/_internal/server/models.py | 1 + .../_internal/server/services/fleets.py | 33 ++- .../_internal/server/services/instances.py | 56 ++++- .../server/services/jobs/__init__.py | 27 ++- .../_internal/server/routers/test_fleets.py | 30 ++- .../server/services/test_instances.py | 49 ++++- 11 files changed, 214 insertions(+), 220 deletions(-) diff --git a/src/dstack/_internal/core/models/instances.py b/src/dstack/_internal/core/models/instances.py index bf1696758d..012916f97e 100644 --- a/src/dstack/_internal/core/models/instances.py +++ b/src/dstack/_internal/core/models/instances.py @@ -256,6 +256,7 @@ def finished_statuses(cls) -> List["InstanceStatus"]: class InstanceTerminationReason(str, Enum): + TERMINATED_BY_USER = "terminated_by_user" IDLE_TIMEOUT = "idle_timeout" PROVISIONING_TIMEOUT = "provisioning_timeout" ERROR = "error" diff --git 
a/src/dstack/_internal/server/background/tasks/process_compute_groups.py b/src/dstack/_internal/server/background/tasks/process_compute_groups.py index 5f7b6820a4..6b449efab4 100644 --- a/src/dstack/_internal/server/background/tasks/process_compute_groups.py +++ b/src/dstack/_internal/server/background/tasks/process_compute_groups.py @@ -17,6 +17,7 @@ ) from dstack._internal.server.services import backends as backends_services from dstack._internal.server.services.compute_groups import compute_group_model_to_compute_group +from dstack._internal.server.services.instances import switch_instance_status from dstack._internal.server.services.locking import get_locker from dstack._internal.server.utils import sentry_utils from dstack._internal.utils.common import get_current_datetime, run_async @@ -83,12 +84,14 @@ async def _process_compute_group(session: AsyncSession, compute_group_model: Com ) compute_group_model = res.unique().scalar_one() if all(i.status == InstanceStatus.TERMINATING for i in compute_group_model.instances): - await _terminate_compute_group(compute_group_model) + await _terminate_compute_group(session, compute_group_model) compute_group_model.last_processed_at = get_current_datetime() await session.commit() -async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> None: +async def _terminate_compute_group( + session: AsyncSession, compute_group_model: ComputeGroupModel +) -> None: if ( compute_group_model.last_termination_retry_at is not None and _next_termination_retry_at(compute_group_model) > get_current_datetime() @@ -147,7 +150,7 @@ async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> No instance_model.deleted = True instance_model.deleted_at = get_current_datetime() instance_model.finished_at = get_current_datetime() - instance_model.status = InstanceStatus.TERMINATED + switch_instance_status(session, instance_model, InstanceStatus.TERMINATED) logger.info( "Terminated compute group %s", 
compute_group.name, diff --git a/src/dstack/_internal/server/background/tasks/process_fleets.py b/src/dstack/_internal/server/background/tasks/process_fleets.py index d369c7d242..50c3dcfe2a 100644 --- a/src/dstack/_internal/server/background/tasks/process_fleets.py +++ b/src/dstack/_internal/server/background/tasks/process_fleets.py @@ -26,7 +26,7 @@ is_fleet_in_use, switch_fleet_status, ) -from dstack._internal.server.services.instances import format_instance_status_for_event +from dstack._internal.server.services.instances import switch_instance_status from dstack._internal.server.services.locking import get_locker from dstack._internal.server.utils import sentry_utils from dstack._internal.utils.common import get_current_datetime @@ -219,15 +219,10 @@ def _maintain_fleet_nodes_in_min_max_range( if nodes_redundant == 0: break if instance.status in [InstanceStatus.IDLE]: - instance.status = InstanceStatus.TERMINATING instance.termination_reason = InstanceTerminationReason.MAX_INSTANCES_LIMIT instance.termination_reason_message = "Fleet has too many instances" + switch_instance_status(session, instance, InstanceStatus.TERMINATING) nodes_redundant -= 1 - logger.info( - "Terminating instance %s: %s", - instance.name, - instance.termination_reason, - ) return True nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num for i in range(nodes_missing): @@ -243,7 +238,7 @@ def _maintain_fleet_nodes_in_min_max_range( session, ( "Instance created to meet target fleet node count." 
- f" Status: {format_instance_status_for_event(instance_model)}" + f" Status: {instance_model.status.upper()}" ), actor=events.SystemActor(), targets=[events.Target.from_model(instance_model)], diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 454d6ee18a..c2bc27ee85 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -87,6 +87,7 @@ get_instance_requirements, get_instance_ssh_private_keys, remove_dangling_tasks_from_instance, + switch_instance_status, ) from dstack._internal.server.services.locking import get_locker from dstack._internal.server.services.logging import fmt @@ -242,7 +243,7 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): if instance.status == InstanceStatus.PENDING: if instance.remote_connection_info is not None: - await _add_remote(instance) + await _add_remote(session, instance) else: await _create_instance( session=session, @@ -253,17 +254,21 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): InstanceStatus.IDLE, InstanceStatus.BUSY, ): - idle_duration_expired = _check_and_mark_terminating_if_idle_duration_expired(instance) + idle_duration_expired = _check_and_mark_terminating_if_idle_duration_expired( + session, instance + ) if not idle_duration_expired: await _check_instance(session, instance) elif instance.status == InstanceStatus.TERMINATING: - await _terminate(instance) + await _terminate(session, instance) instance.last_processed_at = get_current_datetime() await session.commit() -def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel): +def _check_and_mark_terminating_if_idle_duration_expired( + session: AsyncSession, instance: InstanceModel +): if not ( instance.status == InstanceStatus.IDLE and instance.termination_policy == 
TerminationPolicy.DESTROY_AFTER_IDLE @@ -282,17 +287,9 @@ def _check_and_mark_terminating_if_idle_duration_expired(instance: InstanceModel idle_seconds = instance.termination_idle_time delta = datetime.timedelta(seconds=idle_seconds) if idle_duration > delta: - instance.status = InstanceStatus.TERMINATING instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT - logger.info( - "Instance %s idle duration expired: idle time %ss. Terminating", - instance.name, - str(idle_duration.seconds), - extra={ - "instance_name": instance.name, - "instance_status": instance.status.value, - }, - ) + instance.termination_reason_message = f"Instance idle for {idle_duration.seconds}s" + switch_instance_status(session, instance, InstanceStatus.TERMINATING) return True return False @@ -311,24 +308,16 @@ def _can_terminate_fleet_instances_on_idle_duration(fleet_model: FleetModel) -> return active_instances_num > fleet.spec.configuration.nodes.min -async def _add_remote(instance: InstanceModel) -> None: +async def _add_remote(session: AsyncSession, instance: InstanceModel) -> None: logger.info("Adding ssh instance %s...", instance.name) - if instance.status == InstanceStatus.PENDING: - instance.status = InstanceStatus.PROVISIONING retry_duration_deadline = instance.created_at + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS) if retry_duration_deadline < get_current_datetime(): - instance.status = InstanceStatus.TERMINATED instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT - logger.warning( - "Failed to start instance %s in %d seconds. 
Terminating...", - instance.name, - PROVISIONING_TIMEOUT_SECONDS, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, + instance.termination_reason_message = ( + f"Failed to add SSH instance in {PROVISIONING_TIMEOUT_SECONDS}s" ) + switch_instance_status(session, instance, InstanceStatus.TERMINATED) return try: @@ -341,17 +330,9 @@ async def _add_remote(instance: InstanceModel) -> None: else: ssh_proxy_pkeys = None except (ValueError, PasswordRequiredException): - instance.status = InstanceStatus.TERMINATED instance.termination_reason = InstanceTerminationReason.ERROR instance.termination_reason_message = "Unsupported private SSH key type" - logger.warning( - "Failed to add instance %s: unsupported private SSH key type", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) + switch_instance_status(session, instance, InstanceStatus.TERMINATED) return authorized_keys = [pk.public.strip() for pk in remote_details.ssh_keys] @@ -368,19 +349,13 @@ async def _add_remote(instance: InstanceModel) -> None: raise ProvisioningError(f"Deploy timeout: {e}") from e except Exception as e: raise ProvisioningError(f"Deploy instance raised an error: {e}") from e - else: - logger.info( - "The instance %s (%s) was successfully added", - instance.name, - remote_details.host, - ) except ProvisioningError as e: logger.warning( "Provisioning instance %s could not be completed because of the error: %s", instance.name, e, ) - instance.status = InstanceStatus.PENDING + # Stays in PENDING, may retry later return instance_type = host_info_to_instance_type(host_info, arch) @@ -400,35 +375,19 @@ async def _add_remote(instance: InstanceModel) -> None: addresses=host_network_addresses, ) if instance_network is not None and internal_ip is None: - instance.status = InstanceStatus.TERMINATED instance.termination_reason = InstanceTerminationReason.ERROR 
instance.termination_reason_message = ( "Failed to locate internal IP address on the given network" ) - logger.warning( - "Failed to add instance %s: failed to locate internal IP address on the given network", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) + switch_instance_status(session, instance, InstanceStatus.TERMINATED) return if internal_ip is not None: if not is_ip_among_addresses(ip_address=internal_ip, addresses=host_network_addresses): - instance.status = InstanceStatus.TERMINATED instance.termination_reason = InstanceTerminationReason.ERROR instance.termination_reason_message = ( "Specified internal IP not found among instance interfaces" ) - logger.warning( - "Failed to add instance %s: specified internal IP not found among instance interfaces", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) + switch_instance_status(session, instance, InstanceStatus.TERMINATED) return divisible, blocks = is_divisible_into_blocks( @@ -439,17 +398,9 @@ async def _add_remote(instance: InstanceModel) -> None: if divisible: instance.total_blocks = blocks else: - instance.status = InstanceStatus.TERMINATED instance.termination_reason = InstanceTerminationReason.ERROR instance.termination_reason_message = "Cannot split into blocks" - logger.warning( - "Failed to add instance %s: cannot split into blocks", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) + switch_instance_status(session, instance, InstanceStatus.TERMINATED) return region = instance.region @@ -470,7 +421,9 @@ async def _add_remote(instance: InstanceModel) -> None: ssh_proxy=remote_details.ssh_proxy, ) - instance.status = InstanceStatus.IDLE if health else InstanceStatus.PROVISIONING + switch_instance_status( + session, instance, InstanceStatus.IDLE if health else 
InstanceStatus.PROVISIONING + ) instance.backend = BackendType.REMOTE instance_offer = InstanceOfferWithAvailability( backend=BackendType.REMOTE, @@ -562,18 +515,13 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No profile = get_instance_profile(instance) requirements = get_instance_requirements(instance) except ValidationError as e: - instance.status = InstanceStatus.TERMINATED instance.termination_reason = InstanceTerminationReason.ERROR instance.termination_reason_message = ( f"Error to parse profile, requirements or instance_configuration: {e}" ) - logger.warning( - "Error to parse profile, requirements or instance_configuration. Terminate instance: %s", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, + switch_instance_status(session, instance, InstanceStatus.TERMINATED) + logger.exception( + "%s: error parsing profile, requirements or instance configuration", fmt(instance) ) return @@ -664,7 +612,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No ) continue - instance.status = InstanceStatus.PROVISIONING + switch_instance_status(session, instance, InstanceStatus.PROVISIONING) instance.backend = backend.TYPE instance.region = instance_offer.region instance.price = instance_offer.price @@ -674,14 +622,6 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No instance.total_blocks = instance_offer.total_blocks instance.started_at = get_current_datetime() - logger.info( - "Created instance %s", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.PROVISIONING.value, - }, - ) if instance.fleet_id and instance.id == master_instance.id: # Clean up placement groups that did not end up being used. # Flush to update still uncommitted placement groups. 
@@ -695,18 +635,17 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No ) return - _mark_terminated( - instance, - InstanceTerminationReason.NO_OFFERS, - "All offers failed" if offers else "No offers found", - ) + instance.termination_reason = InstanceTerminationReason.NO_OFFERS + instance.termination_reason_message = "All offers failed" if offers else "No offers found" + switch_instance_status(session, instance, InstanceStatus.TERMINATED) if instance.fleet and instance.id == master_instance.id and is_cloud_cluster(instance.fleet): # Do not attempt to deploy other instances, as they won't determine the correct cluster # backend, region, and placement group without a successfully deployed master instance for sibling_instance in instance.fleet.instances: if sibling_instance.id == instance.id: continue - _mark_terminated(sibling_instance, InstanceTerminationReason.MASTER_FAILED) + sibling_instance.termination_reason = InstanceTerminationReason.MASTER_FAILED + switch_instance_status(session, sibling_instance, InstanceStatus.TERMINATED) async def _get_fleet_master_instance( @@ -723,25 +662,6 @@ async def _get_fleet_master_instance( return res.scalar_one() -def _mark_terminated( - instance: InstanceModel, - termination_reason: InstanceTerminationReason, - termination_reason_message: Optional[str] = None, -) -> None: - instance.status = InstanceStatus.TERMINATED - instance.termination_reason = termination_reason - instance.termination_reason_message = termination_reason_message - logger.info( - "Terminated instance %s: %s", - instance.name, - instance.termination_reason, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) - - async def _check_instance(session: AsyncSession, instance: InstanceModel) -> None: if ( instance.status == InstanceStatus.BUSY @@ -749,9 +669,9 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non and all(job.status.is_finished() for 
job in instance.jobs) ): # A busy instance could have no active jobs due to this bug: https://github.com/dstackai/dstack/issues/2068 - instance.status = InstanceStatus.TERMINATING instance.termination_reason = InstanceTerminationReason.JOB_FINISHED - logger.info( + switch_instance_status(session, instance, InstanceStatus.TERMINATING) + logger.warning( "Detected busy instance %s with finished job. Marked as TERMINATING", instance.name, extra={ @@ -770,6 +690,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non ) project = res.unique().scalar_one() await _wait_for_instance_provisioning_data( + session=session, project=project, instance=instance, job_provisioning_data=job_provisioning_data, @@ -778,7 +699,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non if not job_provisioning_data.dockerized: if instance.status == InstanceStatus.PROVISIONING: - instance.status = InstanceStatus.BUSY + switch_instance_status(session, instance, InstanceStatus.BUSY) return ssh_private_keys = get_instance_ssh_private_keys(instance) @@ -845,15 +766,10 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non instance.termination_deadline = None if instance.status == InstanceStatus.PROVISIONING: - instance.status = InstanceStatus.IDLE if not instance.jobs else InstanceStatus.BUSY - logger.info( - "Instance %s has switched to %s status", - instance.name, - instance.status.value, - extra={ - "instance_name": instance.name, - "instance_status": instance.status.value, - }, + switch_instance_status( + session, + instance, + InstanceStatus.IDLE if not instance.jobs else InstanceStatus.BUSY, ) return @@ -866,31 +782,18 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non job_provisioning_data=job_provisioning_data, ) if get_current_datetime() > provisioning_deadline: - instance.status = InstanceStatus.TERMINATING - logger.warning( - "Instance %s has not started in time. 
Marked as TERMINATING", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATING.value, - }, - ) + instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT + instance.termination_reason_message = "Instance did not become reachable in time" + switch_instance_status(session, instance, InstanceStatus.TERMINATING) elif instance.status.is_available(): deadline = instance.termination_deadline if get_current_datetime() > deadline: - instance.status = InstanceStatus.TERMINATING instance.termination_reason = InstanceTerminationReason.UNREACHABLE - logger.warning( - "Instance %s shim waiting timeout. Marked as TERMINATING", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATING.value, - }, - ) + switch_instance_status(session, instance, InstanceStatus.TERMINATING) async def _wait_for_instance_provisioning_data( + session: AsyncSession, project: ProjectModel, instance: InstanceModel, job_provisioning_data: JobProvisioningData, @@ -904,12 +807,9 @@ async def _wait_for_instance_provisioning_data( job_provisioning_data=job_provisioning_data, ) if get_current_datetime() > provisioning_deadline: - logger.warning( - "Instance %s failed because instance has not become running in time", instance.name - ) - instance.status = InstanceStatus.TERMINATING instance.termination_reason = InstanceTerminationReason.PROVISIONING_TIMEOUT instance.termination_reason_message = "Backend did not complete provisioning in time" + switch_instance_status(session, instance, InstanceStatus.TERMINATING) return backend = await backends_services.get_project_backend_by_type( @@ -921,9 +821,9 @@ async def _wait_for_instance_provisioning_data( "Instance %s failed because instance's backend is not available", instance.name, ) - instance.status = InstanceStatus.TERMINATING instance.termination_reason = InstanceTerminationReason.ERROR instance.termination_reason_message = "Backend not 
available" + switch_instance_status(session, instance, InstanceStatus.TERMINATING) return try: await run_async( @@ -939,9 +839,9 @@ async def _wait_for_instance_provisioning_data( instance.name, repr(e), ) - instance.status = InstanceStatus.TERMINATING instance.termination_reason = InstanceTerminationReason.ERROR instance.termination_reason_message = "Error while waiting for instance to become running" + switch_instance_status(session, instance, InstanceStatus.TERMINATING) except Exception: logger.exception( "Got exception when updating instance %s provisioning data", instance.name @@ -1137,7 +1037,7 @@ def _get_instance_cpu_arch(instance: InstanceModel) -> Optional[gpuhunt.CPUArchi return jpd.instance_type.resources.cpu_arch -async def _terminate(instance: InstanceModel) -> None: +async def _terminate(session: AsyncSession, instance: InstanceModel) -> None: if ( instance.last_termination_retry_at is not None and _next_termination_retry_at(instance) > get_current_datetime() @@ -1190,15 +1090,7 @@ async def _terminate(instance: InstanceModel) -> None: instance.deleted = True instance.deleted_at = get_current_datetime() instance.finished_at = get_current_datetime() - instance.status = InstanceStatus.TERMINATED - logger.info( - "Instance %s terminated", - instance.name, - extra={ - "instance_name": instance.name, - "instance_status": InstanceStatus.TERMINATED.value, - }, - ) + switch_instance_status(session, instance, InstanceStatus.TERMINATED) def _next_termination_retry_at(instance: InstanceModel) -> datetime.datetime: diff --git a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py index e132f83a49..2320394436 100644 --- a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py @@ -87,8 +87,9 @@ is_cloud_cluster, ) from dstack._internal.server.services.instances import ( - 
format_instance_status_for_event, + format_instance_blocks_for_event, get_instance_provisioning_data, + switch_instance_status, ) from dstack._internal.server.services.jobs import ( check_can_attach_job_volumes, @@ -507,7 +508,7 @@ async def _process_submitted_job( session.add(instance) events.emit( session, - f"Instance created for job. Instance status: {format_instance_status_for_event(instance)}", + f"Instance created for job. Instance status: {instance.status.upper()}", actor=events.SystemActor(), targets=[ events.Target.from_model(instance), @@ -646,7 +647,7 @@ async def _assign_job_to_fleet_instance( .options(joinedload(InstanceModel.volume_attachments)) ) instance = res.unique().scalar_one() - instance.status = InstanceStatus.BUSY + switch_instance_status(session, instance, InstanceStatus.BUSY) instance.busy_blocks += offer.blocks job_model.instance = instance @@ -657,7 +658,7 @@ async def _assign_job_to_fleet_instance( session, ( "Job assigned to instance." - f" Instance status: {format_instance_status_for_event(instance)}" + f" Instance blocks: {format_instance_blocks_for_event(instance)}" ), actor=events.SystemActor(), targets=[ diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 5274d9ebfd..6a8aa41eb4 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -632,6 +632,7 @@ class InstanceModel(BaseModel): compute_group_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("compute_groups.id")) compute_group: Mapped[Optional["ComputeGroupModel"]] = relationship(back_populates="instances") + # NOTE: `status` must be changed only via `switch_instance_status()` status: Mapped[InstanceStatus] = mapped_column(EnumAsString(InstanceStatus, 100), index=True) unreachable: Mapped[bool] = mapped_column(Boolean) diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index 19e4a77e64..9b877475f7 100644 --- 
a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -31,6 +31,7 @@ from dstack._internal.core.models.instances import ( InstanceOfferWithAvailability, InstanceStatus, + InstanceTerminationReason, RemoteConnectionInfo, SSHConnectionParams, SSHKey, @@ -65,9 +66,9 @@ from dstack._internal.server.services import instances as instances_services from dstack._internal.server.services import offers as offers_services from dstack._internal.server.services.instances import ( - format_instance_status_for_event, get_instance_remote_connection_info, list_active_remote_instances, + switch_instance_status, ) from dstack._internal.server.services.locking import ( get_locker, @@ -679,7 +680,9 @@ async def delete_fleets( "Deleting fleets %s instances %s", [f.name for f in fleet_models], instance_nums ) for fleet_model in fleet_models: - _terminate_fleet_instances(fleet_model=fleet_model, instance_nums=instance_nums) + _terminate_fleet_instances( + session=session, fleet_model=fleet_model, instance_nums=instance_nums, actor=user + ) # TERMINATING fleets are deleted by process_fleets after instances are terminated if instance_nums is None: switch_fleet_status( @@ -873,7 +876,7 @@ async def _create_fleet( session, ( "Instance created on fleet submission." - f" Status: {format_instance_status_for_event(instance_model)}" + f" Status: {instance_model.status.upper()}" ), actor=events.UserActor.from_user(user), targets=[events.Target.from_model(instance_model)], @@ -892,7 +895,7 @@ async def _create_fleet( session, ( "Instance created on fleet submission." - f" Status: {format_instance_status_for_event(instance_model)}" + f" Status: {instance_model.status.upper()}" ), # Set `SystemActor` for consistency with other places where cloud instances can be # created (fleet spec consolidation, job provisioning, etc). 
Think of the fleet as being @@ -978,17 +981,14 @@ async def _update_fleet( ) events.emit( session, - ( - "Instance created on fleet update." - f" Status: {format_instance_status_for_event(instance_model)}" - ), + f"Instance created on fleet update. Status: {instance_model.status.upper()}", actor=events.UserActor.from_user(user), targets=[events.Target.from_model(instance_model)], ) fleet_model.instances.append(instance_model) active_instance_nums.add(instance_num) if removed_instance_nums: - _terminate_fleet_instances(fleet_model, removed_instance_nums) + _terminate_fleet_instances(session, fleet_model, removed_instance_nums, actor=user) await session.commit() return fleet_model_to_fleet(fleet_model) @@ -1197,7 +1197,12 @@ def _get_fleet_nodes_to_provision(spec: FleetSpec) -> int: return spec.configuration.nodes.target -def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[List[int]]): +def _terminate_fleet_instances( + session: AsyncSession, + fleet_model: FleetModel, + instance_nums: Optional[List[int]], + actor: UserModel, +): if is_fleet_in_use(fleet_model, instance_nums=instance_nums): if instance_nums is not None: raise ServerClientError( @@ -1210,4 +1215,10 @@ def _terminate_fleet_instances(fleet_model: FleetModel, instance_nums: Optional[ if instance.status == InstanceStatus.TERMINATED: instance.deleted = True else: - instance.status = InstanceStatus.TERMINATING + instance.termination_reason = InstanceTerminationReason.TERMINATED_BY_USER + switch_instance_status( + session, + instance, + InstanceStatus.TERMINATING, + actor=events.UserActor.from_user(actor), + ) diff --git a/src/dstack/_internal/server/services/instances.py b/src/dstack/_internal/server/services/instances.py index bf837469d0..14f26cc3f0 100644 --- a/src/dstack/_internal/server/services/instances.py +++ b/src/dstack/_internal/server/services/instances.py @@ -25,6 +25,7 @@ InstanceOffer, InstanceOfferWithAvailability, InstanceStatus, + InstanceTerminationReason, 
InstanceType, RemoteConnectionInfo, Resources, @@ -49,6 +50,7 @@ ) from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse from dstack._internal.server.schemas.runner import InstanceHealthResponse, TaskStatus +from dstack._internal.server.services import events from dstack._internal.server.services.logging import fmt from dstack._internal.server.services.offers import generate_shared_offer from dstack._internal.server.services.projects import list_user_project_models @@ -59,11 +61,55 @@ logger = get_logger(__name__) -def format_instance_status_for_event(instance_model: InstanceModel) -> str: - msg = instance_model.status.upper() - if instance_model.total_blocks is not None: - msg += f" ({instance_model.busy_blocks}/{instance_model.total_blocks} blocks busy)" - return msg +def switch_instance_status( + session: AsyncSession, + instance_model: InstanceModel, + new_status: InstanceStatus, + actor: events.AnyActor = events.SystemActor(), +): + """ + Switch instance status. 
+ + **Usage notes**: + + - When switching to `TERMINATING` or `TERMINATED`, + `instance_model.termination_reason` must be set + + - When `instance_model.termination_reason` is set to `ERROR`, + the error must be further explained in `instance_model.termination_reason_message` + """ + + old_status = instance_model.status + if old_status == new_status: + return + + instance_model.status = new_status + + msg = f"Instance status changed {old_status.upper()} -> {new_status.upper()}" + if ( + new_status == InstanceStatus.TERMINATING + or new_status == InstanceStatus.TERMINATED + and old_status != InstanceStatus.TERMINATING + ): + if instance_model.termination_reason is None: + raise ValueError( + f"termination_reason must be set when switching to {new_status.upper()} status" + ) + if ( + instance_model.termination_reason == InstanceTerminationReason.ERROR + and not instance_model.termination_reason_message + ): + raise ValueError( + "termination_reason_message must be set when termination_reason is ERROR" + ) + msg += f". 
Termination reason: {instance_model.termination_reason.upper()}" + if instance_model.termination_reason_message: + msg += f" ({instance_model.termination_reason_message})" + events.emit(session, msg, actor=actor, targets=[events.Target.from_model(instance_model)]) + + +def format_instance_blocks_for_event(instance_model: InstanceModel) -> str: + return f"{instance_model.busy_blocks}/{instance_model.total_blocks} busy" async def get_instance_health_checks( diff --git a/src/dstack/_internal/server/services/jobs/__init__.py b/src/dstack/_internal/server/services/jobs/__init__.py index b86bd09643..18d410c133 100644 --- a/src/dstack/_internal/server/services/jobs/__init__.py +++ b/src/dstack/_internal/server/services/jobs/__init__.py @@ -21,7 +21,7 @@ ) from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.configurations import RunConfigurationType -from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason from dstack._internal.core.models.runs import ( Job, JobProvisioningData, @@ -44,8 +44,9 @@ from dstack._internal.server.services import events, services from dstack._internal.server.services import volumes as volumes_services from dstack._internal.server.services.instances import ( - format_instance_status_for_event, + format_instance_blocks_for_event, get_instance_ssh_private_keys, + switch_instance_status, ) from dstack._internal.server.services.jobs.configurators.base import ( JobConfigurator, @@ -352,18 +353,16 @@ async def process_terminating_job( blocks = 1 instance_model.busy_blocks -= blocks - if instance_model.status == InstanceStatus.BUSY: + if instance_model.status != InstanceStatus.BUSY or jpd is None or not jpd.dockerized: + # Terminate instances that: + # - have not finished provisioning yet + # - belong to container-based backends, and hence cannot be reused + if instance_model.status not in 
InstanceStatus.finished_statuses(): + instance_model.termination_reason = InstanceTerminationReason.JOB_FINISHED + switch_instance_status(session, instance_model, InstanceStatus.TERMINATING) + elif not [j for j in instance_model.jobs if j.id != job_model.id]: # no other jobs besides this one - if not [j for j in instance_model.jobs if j.id != job_model.id]: - instance_model.status = InstanceStatus.IDLE - elif instance_model.status != InstanceStatus.TERMINATED: - # instance was PROVISIONING (specially for the job) - # schedule for termination - instance_model.status = InstanceStatus.TERMINATING - - if jpd is None or not jpd.dockerized: - # do not reuse vastai/k8s instances - instance_model.status = InstanceStatus.TERMINATING + switch_instance_status(session, instance_model, InstanceStatus.IDLE) # The instance should be released even if detach fails # so that stuck volumes don't prevent the instance from terminating. @@ -374,7 +373,7 @@ async def process_terminating_job( session, ( "Job unassigned from instance." 
- f" Instance status: {format_instance_status_for_event(instance_model)}" + f" Instance blocks: {format_instance_blocks_for_event(instance_model)}" ), actor=events.SystemActor(), targets=[ diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index afa68b788d..b00d6ccf57 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -2,7 +2,7 @@ from datetime import datetime, timezone from typing import Optional from unittest.mock import Mock, patch -from uuid import UUID, uuid4 +from uuid import uuid4 import pytest from freezegun import freeze_time @@ -603,19 +603,17 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A remote_connection_info=get_remote_connection_info(host="10.0.0.100"), ) - with patch("uuid.uuid4") as m: - m.return_value = UUID("1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") - response = await client.post( - f"/api/project/{project.name}/fleets/apply", - headers=get_auth_headers(user.token), - json={ - "plan": { - "spec": spec.dict(), - "current_resource": _fleet_model_to_json_dict(fleet), - }, - "force": False, + response = await client.post( + f"/api/project/{project.name}/fleets/apply", + headers=get_auth_headers(user.token), + json={ + "plan": { + "spec": spec.dict(), + "current_resource": _fleet_model_to_json_dict(fleet), }, - ) + "force": False, + }, + ) assert response.status_code == 200, response.json() assert response.json() == { @@ -711,7 +709,7 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A "status": "terminating", "unreachable": False, "health_status": "healthy", - "termination_reason": None, + "termination_reason": "terminated_by_user", "termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", "region": "remote", @@ -721,7 +719,7 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A "busy_blocks": 0, }, { - 
"id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + "id": SomeUUID4Str(), "project_name": project.name, "backend": "remote", "instance_type": { @@ -761,7 +759,7 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A await session.refresh(instance) assert instance.status == InstanceStatus.TERMINATING res = await session.execute( - select(InstanceModel).where(InstanceModel.id == "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + select(InstanceModel).where(InstanceModel.id == response.json()["instances"][1]["id"]) ) instance = res.unique().scalar_one() assert instance.status == InstanceStatus.PENDING diff --git a/src/tests/_internal/server/services/test_instances.py b/src/tests/_internal/server/services/test_instances.py index aa248aa485..9e4cb02e3a 100644 --- a/src/tests/_internal/server/services/test_instances.py +++ b/src/tests/_internal/server/services/test_instances.py @@ -1,6 +1,7 @@ import uuid import pytest +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession import dstack._internal.server.services.instances as instances_services @@ -9,11 +10,12 @@ from dstack._internal.core.models.instances import ( Instance, InstanceStatus, + InstanceTerminationReason, InstanceType, Resources, ) from dstack._internal.core.models.profiles import Profile -from dstack._internal.server.models import InstanceModel +from dstack._internal.server.models import EventModel, InstanceModel from dstack._internal.server.testing.common import ( create_instance, create_project, @@ -24,6 +26,51 @@ from dstack._internal.utils.common import get_current_datetime +class TestSwitchInstanceStatus: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_includes_termination_reason_in_event_messages_only_once( + self, test_db, session: AsyncSession + ) -> None: + project = await create_project(session=session) + instance = await create_instance( + session=session, project=project, 
status=InstanceStatus.PENDING + ) + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Some err" + instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATING) + instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATED) + + res = await session.execute(select(EventModel)) + events = res.scalars().all() + assert len(events) == 2 + assert {e.message for e in events} == { + "Instance status changed PENDING -> TERMINATING. Termination reason: ERROR (Some err)", + # Do not duplicate the termination reason in the second event + "Instance status changed TERMINATING -> TERMINATED", + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_includes_termination_reason_in_event_message_when_switching_directly_to_terminated( + self, test_db, session: AsyncSession + ) -> None: + project = await create_project(session=session) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.PENDING + ) + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Some err" + instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATED) + + res = await session.execute(select(EventModel)) + events = res.scalars().all() + assert len(events) == 1 + assert events[0].message == ( + "Instance status changed PENDING -> TERMINATED. 
Termination reason: ERROR (Some err)" + ) + + class TestFilterPoolInstances: # TODO: Refactor filter_pool_instances to not depend on InstanceModel and simplify tests @pytest.mark.asyncio From c01b022a0a6611c272626540234f0f4fe7148fee Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Tue, 20 Jan 2026 11:06:01 +0000 Subject: [PATCH 062/187] [runner] Restore `--home-dir` option as no-op (#3480) Fixes: https://github.com/dstackai/dstack/issues/3474 --- runner/cmd/runner/main.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index c2ed94f0eb..c8125dc848 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -78,6 +78,12 @@ func mainInner() int { Usage: "dstack server or user authorized key. May be specified multiple times", Destination: &sshAuthorizedKeys, }, + // --home-dir is not used since 0.20.4, but the flag was retained as no-op + // for compatibility with pre-0.20.4 shims; remove the flag eventually + &cli.StringFlag{ + Name: "home-dir", + Hidden: true, + }, }, Action: func(ctx context.Context, cmd *cli.Command) error { return start(ctx, tempDir, httpPort, sshPort, sshAuthorizedKeys, logLevel, Version) From 65eacc7796f81aeb3dcff34e64b9e6ab4fa6fa7a Mon Sep 17 00:00:00 2001 From: Oleg Date: Tue, 20 Jan 2026 23:45:48 +0300 Subject: [PATCH 063/187] [UI] Default fleet in project wizard (#3464) * [UI] Default fleet in project wizard #373 * Minor cosmetic changes * Fixes after review * Was added create project wizard for oss * Cosmetical changes + help info * Was added create fleet wizard * Fixes after review * Refactoring after review * Fixes after review * Fixes after review * Cosmetics * Fixes after review --------- Co-authored-by: peterschmidt85 --- frontend/src/api.ts | 1 + .../ButtonWithConfirmation/index.tsx | 20 +- .../components/ConfirmationDialog/index.tsx | 5 +- .../components/ConfirmationDialog/slice.ts | 34 ++ .../components/form/Toogle/index.module.scss | 17 + 
frontend/src/components/form/Toogle/index.tsx | 78 +++++ frontend/src/components/form/Toogle/types.ts | 13 + frontend/src/components/index.ts | 1 + frontend/src/hooks/index.ts | 1 + frontend/src/hooks/useConfirmationDialog.ts | 27 ++ frontend/src/hooks/useNotifications.ts | 1 + frontend/src/layouts/AppLayout/index.tsx | 7 + frontend/src/locale/en.json | 40 ++- .../Fleets/Add/FleetFormFields/constants.tsx | 115 +++++++ .../Fleets/Add/FleetFormFields/index.tsx | 79 +++++ .../pages/Fleets/Add/FleetFormFields/type.ts | 15 + frontend/src/pages/Fleets/Add/index.tsx | 254 ++++++++++++++ frontend/src/pages/Fleets/Add/types.ts | 5 + frontend/src/pages/Fleets/index.ts | 1 + frontend/src/pages/Project/Add/index.tsx | 311 +++++++++++++++++- .../src/pages/Project/CreateWizard/index.tsx | 297 +++++++++-------- .../src/pages/Project/CreateWizard/types.ts | 9 +- frontend/src/pages/Project/Form/types.ts | 10 +- frontend/src/pages/Project/List/index.tsx | 4 +- .../components/NoFleetProjectAlert/index.tsx | 8 +- frontend/src/pages/Project/constants.tsx | 32 ++ .../Project/hooks/useYupValidationResolver.ts | 38 +++ frontend/src/pages/User/Details/index.tsx | 8 +- frontend/src/pages/User/List/index.tsx | 2 + frontend/src/router.tsx | 6 +- frontend/src/routes.ts | 4 + frontend/src/services/fleet.ts | 20 +- frontend/src/services/project.ts | 2 +- frontend/src/store.ts | 2 + frontend/src/types/fleet.d.ts | 16 +- frontend/src/types/project.d.ts | 4 + 36 files changed, 1282 insertions(+), 205 deletions(-) create mode 100644 frontend/src/components/ConfirmationDialog/slice.ts create mode 100644 frontend/src/components/form/Toogle/index.module.scss create mode 100644 frontend/src/components/form/Toogle/index.tsx create mode 100644 frontend/src/components/form/Toogle/types.ts create mode 100644 frontend/src/hooks/useConfirmationDialog.ts create mode 100644 frontend/src/pages/Fleets/Add/FleetFormFields/constants.tsx create mode 100644 frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx 
create mode 100644 frontend/src/pages/Fleets/Add/FleetFormFields/type.ts create mode 100644 frontend/src/pages/Fleets/Add/index.tsx create mode 100644 frontend/src/pages/Fleets/Add/types.ts create mode 100644 frontend/src/pages/Project/constants.tsx create mode 100644 frontend/src/pages/Project/hooks/useYupValidationResolver.ts diff --git a/frontend/src/api.ts b/frontend/src/api.ts index d58dbc7d38..144a21bc86 100644 --- a/frontend/src/api.ts +++ b/frontend/src/api.ts @@ -99,6 +99,7 @@ export const API = { // Fleets FLEETS: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/list`, FLEETS_DETAILS: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/get`, + FLEETS_APPLY: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/apply`, FLEETS_DELETE: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/delete`, FLEET_INSTANCES_DELETE: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/fleets/delete_instances`, diff --git a/frontend/src/components/ButtonWithConfirmation/index.tsx b/frontend/src/components/ButtonWithConfirmation/index.tsx index 78c2793d9c..56ae78ad59 100644 --- a/frontend/src/components/ButtonWithConfirmation/index.tsx +++ b/frontend/src/components/ButtonWithConfirmation/index.tsx @@ -1,4 +1,5 @@ import React, { useState } from 'react'; +import { useTranslation } from 'react-i18next'; import Box from '@cloudscape-design/components/box'; import { Button } from '../Button'; @@ -13,20 +14,31 @@ export const ButtonWithConfirmation: React.FC = ({ confirmButtonLabel, ...props }) => { + const { t } = useTranslation(); const [showDeleteConfirm, setShowConfirmDelete] = useState(false); const toggleDeleteConfirm = () => { setShowConfirmDelete((val) => !val); }; - const content = typeof confirmContent === 'string' ? 
{confirmContent} : confirmContent; - const onConfirm = () => { if (onClick) onClick(); setShowConfirmDelete(false); }; + const getContent = () => { + if (!confirmContent) { + return {t('confirm_dialog.message')}; + } + + if (typeof confirmContent === 'string') { + return {confirmContent}; + } + + return confirmContent; + }; + return ( <> } + {isAvailableProjectManaging && } ); }; @@ -137,7 +137,7 @@ export const ProjectList: React.FC = () => { {t('common.delete')} - + } diff --git a/frontend/src/pages/Project/constants.tsx b/frontend/src/pages/Project/constants.tsx new file mode 100644 index 0000000000..151740116b --- /dev/null +++ b/frontend/src/pages/Project/constants.tsx @@ -0,0 +1,32 @@ +import React from 'react'; + +export const DEFAULT_FLEET_INFO = { + header:

Default fleet

, + body: ( + <> +

+ Fleets act both as pools of instances and as templates for how those instances are provisioned. When you submit + a dev environment, task, or service, dstack reuses idle instances or provisions new + ones based on the fleet configuration. +

+ +

+ If you set Min number of instances to 0, dstack will provision instances + only when you run a dev environment, task, or service. +

+ +

+ At least one fleet is required to run dev environments, tasks, or services. Create it here, or create it using + the dstack apply command via the CLI. +

+ +

+ To learn more about fleets, see the{' '} + + documentation + + . +

+ + ), +}; diff --git a/frontend/src/pages/Project/hooks/useYupValidationResolver.ts b/frontend/src/pages/Project/hooks/useYupValidationResolver.ts new file mode 100644 index 0000000000..2cd694c63d --- /dev/null +++ b/frontend/src/pages/Project/hooks/useYupValidationResolver.ts @@ -0,0 +1,38 @@ +import { useCallback } from 'react'; +// eslint-disable-next-line @typescript-eslint/ban-ts-comment +// @ts-expect-error +export function useYupValidationResolver(validationSchema) { + return useCallback( + async (data: TData) => { + try { + const values = await validationSchema.validate(data, { + abortEarly: false, + }); + + return { + values, + errors: {}, + }; + } catch (errors) { + return { + values: {}, + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + errors: errors.inner.reduce( + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + (allErrors, currentError) => ({ + ...allErrors, + [currentError.path]: { + type: currentError.type ?? 
'validation', + message: currentError.message, + }, + }), + {}, + ), + }; + } + }, + [validationSchema], + ); +} diff --git a/frontend/src/pages/User/Details/index.tsx b/frontend/src/pages/User/Details/index.tsx index 805b2efc98..1ee131094c 100644 --- a/frontend/src/pages/User/Details/index.tsx +++ b/frontend/src/pages/User/Details/index.tsx @@ -95,7 +95,13 @@ export const UserDetails: React.FC = () => { - + {t('confirm_dialog.message')}} + onDiscard={toggleDeleteConfirm} + onConfirm={deleteUserHandler} + confirmButtonLabel={t('common.delete')} + /> ); }; diff --git a/frontend/src/pages/User/List/index.tsx b/frontend/src/pages/User/List/index.tsx index 17bc417e94..5831d03383 100644 --- a/frontend/src/pages/User/List/index.tsx +++ b/frontend/src/pages/User/List/index.tsx @@ -208,8 +208,10 @@ export const UserList: React.FC = () => { {t('confirm_dialog.message')}} onDiscard={toggleDeleteConfirm} onConfirm={deleteSelectedUserHandler} + confirmButtonLabel={t('common.delete')} /> ); diff --git a/frontend/src/router.tsx b/frontend/src/router.tsx index fbdeca2942..34a8abaaf0 100644 --- a/frontend/src/router.tsx +++ b/frontend/src/router.tsx @@ -10,7 +10,7 @@ import { LoginByGoogleCallback } from 'App/Login/LoginByGoogleCallback'; import { LoginByOktaCallback } from 'App/Login/LoginByOktaCallback'; import { TokenLogin } from 'App/Login/TokenLogin'; import { Logout } from 'App/Logout'; -import { FleetDetails, FleetList } from 'pages/Fleets'; +import { FleetAdd, FleetDetails, FleetList } from 'pages/Fleets'; import { EventsList as FleetEventsList } from 'pages/Fleets/Details/Events'; import { FleetDetails as FleetDetailsGeneral } from 'pages/Fleets/Details/FleetDetails'; import { FleetInspect } from 'pages/Fleets/Details/Inspect'; @@ -202,6 +202,10 @@ export const router = createBrowserRouter([ path: ROUTES.FLEETS.LIST, element: , }, + { + path: ROUTES.FLEETS.ADD.TEMPLATE, + element: , + }, { path: ROUTES.FLEETS.DETAILS.TEMPLATE, element: , diff --git 
a/frontend/src/routes.ts b/frontend/src/routes.ts index fea2f978a4..288cef72fc 100644 --- a/frontend/src/routes.ts +++ b/frontend/src/routes.ts @@ -137,6 +137,10 @@ export const ROUTES = { FLEETS: { LIST: '/fleets', + ADD: { + TEMPLATE: `/projects/:projectName/fleets/add`, + FORMAT: (projectName: string) => buildRoute(ROUTES.FLEETS.ADD.TEMPLATE, { projectName }), + }, DETAILS: { TEMPLATE: `/projects/:projectName/fleets/:fleetId`, FORMAT: (projectName: string, fleetId: string) => diff --git a/frontend/src/services/fleet.ts b/frontend/src/services/fleet.ts index 3405a18b8b..fa723d7d2d 100644 --- a/frontend/src/services/fleet.ts +++ b/frontend/src/services/fleet.ts @@ -66,7 +66,25 @@ export const fleetApi = createApi({ invalidatesTags: ['Fleets'], }), + + applyFleet: builder.mutation({ + query: ({ projectName, ...body }) => { + return { + url: API.PROJECTS.FLEETS_APPLY(projectName), + method: 'POST', + body, + }; + }, + + invalidatesTags: ['Fleets'], + }), }), }); -export const { useGetFleetsQuery, useLazyGetFleetsQuery, useDeleteFleetMutation, useGetFleetDetailsQuery } = fleetApi; +export const { + useGetFleetsQuery, + useLazyGetFleetsQuery, + useDeleteFleetMutation, + useGetFleetDetailsQuery, + useApplyFleetMutation, +} = fleetApi; diff --git a/frontend/src/services/project.ts b/frontend/src/services/project.ts index 2f0a4bd6b5..8875c48df6 100644 --- a/frontend/src/services/project.ts +++ b/frontend/src/services/project.ts @@ -74,7 +74,7 @@ export const projectApi = createApi({ providesTags: (result) => (result ? 
[{ type: 'Projects' as const, id: result.project_name }] : []), }), - createProject: builder.mutation({ + createProject: builder.mutation({ query: (project) => ({ url: API.PROJECTS.CREATE(), method: 'POST', diff --git a/frontend/src/store.ts b/frontend/src/store.ts index ca19b1206d..03d2c820e7 100644 --- a/frontend/src/store.ts +++ b/frontend/src/store.ts @@ -1,5 +1,6 @@ import { configureStore } from '@reduxjs/toolkit'; +import confirmationReducer from 'components/ConfirmationDialog/slice'; import notificationsReducer from 'components/Notifications/slice'; import { artifactApi } from 'services/artifact'; @@ -25,6 +26,7 @@ export const store = configureStore({ reducer: { app: appReducer, notifications: notificationsReducer, + confirmation: confirmationReducer, [projectApi.reducerPath]: projectApi.reducer, [runApi.reducerPath]: runApi.reducer, [artifactApi.reducerPath]: artifactApi.reducer, diff --git a/frontend/src/types/fleet.d.ts b/frontend/src/types/fleet.d.ts index 892acf41fa..2813cd4023 100644 --- a/frontend/src/types/fleet.d.ts +++ b/frontend/src/types/fleet.d.ts @@ -45,9 +45,12 @@ declare interface IFleetConfigurationRequest { max?: number; }; placement?: 'any' | 'cluster'; + reservation?: string; resources?: IFleetConfigurationResource[]; + blocks?: string | number; backends?: TBackendType[]; regions?: string[]; + availability_zones?: string[]; instance_types?: string[]; spot_policy?: TSpotPolicy; retry?: @@ -76,13 +79,14 @@ declare interface IProfileRequest { instance_name?: string; creation_policy?: 'reuse' | 'reuse-or-create'; idle_duration?: number | string; - name: string; + name?: string; default?: boolean; } declare interface IFleetSpec { - autocreated: boolean; + autocreated?: boolean; configuration: IFleetConfigurationRequest; + configuration_path?: string; profile: IProfileRequest; } @@ -96,3 +100,11 @@ declare interface IFleet { status: 'submitted' | 'active' | 'terminating' | 'terminated' | 'failed'; status_message: string; } + +declare 
interface IApplyFleetPlanRequestRequest { + plan: { + spec: IFleetSpec; + }; + + force: boolean; +} diff --git a/frontend/src/types/project.d.ts b/frontend/src/types/project.d.ts index cf24c84d03..babb4dab7a 100644 --- a/frontend/src/types/project.d.ts +++ b/frontend/src/types/project.d.ts @@ -46,3 +46,7 @@ declare interface IProjectSecret { name: string; value?: string; } + +declare type IProjectCreateRequestParams = Pick & { + is_public: boolean; +}; From 196459136466fe89ede9c318075b4289507a6375 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 21 Jan 2026 12:25:40 +0500 Subject: [PATCH 064/187] Support shared AWS compute caches (#3483) * Log get_offers times * Request aws quotas and zones in parallel * Revert "Request aws quotas and zones in parallel" This reverts commit a0f365e4a662087824266e5e5dccd7e7a8028bee. * Add AWSQuotasSharedCache * Refactor compute caches --- .../_internal/core/backends/aws/backend.py | 9 ++- .../_internal/core/backends/aws/compute.py | 78 ++++++++++--------- .../_internal/core/backends/base/compute.py | 15 +++- .../_internal/core/backends/gcp/compute.py | 18 ++--- .../server/services/backends/__init__.py | 13 +++- 5 files changed, 84 insertions(+), 49 deletions(-) diff --git a/src/dstack/_internal/core/backends/aws/backend.py b/src/dstack/_internal/core/backends/aws/backend.py index 3dfd4f4093..1169227cc7 100644 --- a/src/dstack/_internal/core/backends/aws/backend.py +++ b/src/dstack/_internal/core/backends/aws/backend.py @@ -1,3 +1,5 @@ +from typing import Optional + import botocore.exceptions from dstack._internal.core.backends.aws.compute import AWSCompute @@ -11,9 +13,12 @@ class AWSBackend(Backend): TYPE = BackendType.AWS COMPUTE_CLASS = AWSCompute - def __init__(self, config: AWSConfig): + def __init__(self, config: AWSConfig, compute: Optional[AWSCompute] = None): self.config = config - self._compute = AWSCompute(self.config) + if compute is not None: + self._compute = compute + else: + self._compute = 
AWSCompute(self.config) self._check_credentials() def compute(self) -> AWSCompute: diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index 48720bb316..be3133456c 100644 --- a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -1,6 +1,7 @@ import threading from collections.abc import Iterable from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Optional, Tuple import boto3 @@ -19,6 +20,8 @@ ) from dstack._internal.core.backends.base.compute import ( Compute, + ComputeCache, + ComputeTTLCache, ComputeWithAllOffersCached, ComputeWithCreateInstanceSupport, ComputeWithGatewaySupport, @@ -94,6 +97,11 @@ def _ec2client_cache_methodkey(self, ec2_client, *args, **kwargs): return hashkey(*args, **kwargs) +@dataclass +class AWSQuotasCache(ComputeTTLCache): + execution_lock: threading.Lock = field(default_factory=threading.Lock) + + class AWSCompute( ComputeWithAllOffersCached, ComputeWithCreateInstanceSupport, @@ -106,7 +114,12 @@ class AWSCompute( ComputeWithVolumeSupport, Compute, ): - def __init__(self, config: AWSConfig): + def __init__( + self, + config: AWSConfig, + quotas_cache: Optional[AWSQuotasCache] = None, + zones_cache: Optional[ComputeCache] = None, + ): super().__init__() self.config = config if isinstance(config.creds, AWSAccessKeyCreds): @@ -119,23 +132,18 @@ def __init__(self, config: AWSConfig): # Caches to avoid redundant API calls when provisioning many instances # get_offers is already cached but we still cache its sub-functions # with more aggressive/longer caches. 
- self._offers_post_filter_cache_lock = threading.Lock() - self._offers_post_filter_cache = TTLCache(maxsize=10, ttl=180) - self._get_regions_to_quotas_cache_lock = threading.Lock() - self._get_regions_to_quotas_execution_lock = threading.Lock() - self._get_regions_to_quotas_cache = TTLCache(maxsize=10, ttl=300) - self._get_regions_to_zones_cache_lock = threading.Lock() - self._get_regions_to_zones_cache = Cache(maxsize=10) - self._get_vpc_id_subnet_id_or_error_cache_lock = threading.Lock() - self._get_vpc_id_subnet_id_or_error_cache = TTLCache(maxsize=100, ttl=600) - self._get_maximum_efa_interfaces_cache_lock = threading.Lock() - self._get_maximum_efa_interfaces_cache = Cache(maxsize=100) - self._get_subnets_availability_zones_cache_lock = threading.Lock() - self._get_subnets_availability_zones_cache = Cache(maxsize=100) - self._create_security_group_cache_lock = threading.Lock() - self._create_security_group_cache = TTLCache(maxsize=100, ttl=600) - self._get_image_id_and_username_cache_lock = threading.Lock() - self._get_image_id_and_username_cache = TTLCache(maxsize=100, ttl=600) + self._offers_post_filter_cache = ComputeTTLCache(cache=TTLCache(maxsize=10, ttl=180)) + if quotas_cache is None: + quotas_cache = AWSQuotasCache(cache=TTLCache(maxsize=10, ttl=600)) + self._regions_to_quotas_cache = quotas_cache + if zones_cache is None: + zones_cache = ComputeCache(cache=Cache(maxsize=10)) + self._regions_to_zones_cache = zones_cache + self._vpc_id_subnet_id_cache = ComputeTTLCache(cache=TTLCache(maxsize=100, ttl=600)) + self._maximum_efa_interfaces_cache = ComputeCache(cache=Cache(maxsize=100)) + self._subnets_availability_zones_cache = ComputeCache(cache=Cache(maxsize=100)) + self._security_group_cache = ComputeTTLCache(cache=TTLCache(maxsize=100, ttl=600)) + self._image_id_and_username_cache = ComputeTTLCache(cache=TTLCache(maxsize=100, ttl=600)) def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: offers = get_catalog_offers( @@ 
-144,7 +152,7 @@ def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability extra_filter=_supported_instances, ) regions = list(set(i.region for i in offers)) - with self._get_regions_to_quotas_execution_lock: + with self._regions_to_quotas_cache.execution_lock: # Cache lock does not prevent concurrent execution. # We use a separate lock to avoid requesting quotas in parallel and hitting rate limits. regions_to_quotas = self._get_regions_to_quotas(self.session, regions) @@ -173,9 +181,9 @@ def _get_offers_cached_key(self, requirements: Requirements) -> int: return hash(requirements.json()) @cachedmethod( - cache=lambda self: self._offers_post_filter_cache, + cache=lambda self: self._offers_post_filter_cache.cache, key=_get_offers_cached_key, - lock=lambda self: self._offers_post_filter_cache_lock, + lock=lambda self: self._offers_post_filter_cache.lock, ) def get_offers_post_filter( self, requirements: Requirements @@ -789,9 +797,9 @@ def _get_regions_to_quotas_key( return hashkey(tuple(regions)) @cachedmethod( - cache=lambda self: self._get_regions_to_quotas_cache, + cache=lambda self: self._regions_to_quotas_cache.cache, key=_get_regions_to_quotas_key, - lock=lambda self: self._get_regions_to_quotas_cache_lock, + lock=lambda self: self._regions_to_quotas_cache.lock, ) def _get_regions_to_quotas( self, @@ -808,9 +816,9 @@ def _get_regions_to_zones_key( return hashkey(tuple(regions)) @cachedmethod( - cache=lambda self: self._get_regions_to_zones_cache, + cache=lambda self: self._regions_to_zones_cache.cache, key=_get_regions_to_zones_key, - lock=lambda self: self._get_regions_to_zones_cache_lock, + lock=lambda self: self._regions_to_zones_cache.lock, ) def _get_regions_to_zones( self, @@ -832,9 +840,9 @@ def _get_vpc_id_subnet_id_or_error_cache_key( ) @cachedmethod( - cache=lambda self: self._get_vpc_id_subnet_id_or_error_cache, + cache=lambda self: self._vpc_id_subnet_id_cache.cache, key=_get_vpc_id_subnet_id_or_error_cache_key, - lock=lambda 
self: self._get_vpc_id_subnet_id_or_error_cache_lock, + lock=lambda self: self._vpc_id_subnet_id_cache.lock, ) def _get_vpc_id_subnet_id_or_error( self, @@ -853,9 +861,9 @@ def _get_vpc_id_subnet_id_or_error( ) @cachedmethod( - cache=lambda self: self._get_maximum_efa_interfaces_cache, + cache=lambda self: self._maximum_efa_interfaces_cache.cache, key=_ec2client_cache_methodkey, - lock=lambda self: self._get_maximum_efa_interfaces_cache_lock, + lock=lambda self: self._maximum_efa_interfaces_cache.lock, ) def _get_maximum_efa_interfaces( self, @@ -877,9 +885,9 @@ def _get_subnets_availability_zones_key( return hashkey(region, tuple(subnet_ids)) @cachedmethod( - cache=lambda self: self._get_subnets_availability_zones_cache, + cache=lambda self: self._subnets_availability_zones_cache.cache, key=_get_subnets_availability_zones_key, - lock=lambda self: self._get_subnets_availability_zones_cache_lock, + lock=lambda self: self._subnets_availability_zones_cache.lock, ) def _get_subnets_availability_zones( self, @@ -893,9 +901,9 @@ def _get_subnets_availability_zones( ) @cachedmethod( - cache=lambda self: self._create_security_group_cache, + cache=lambda self: self._security_group_cache.cache, key=_ec2client_cache_methodkey, - lock=lambda self: self._create_security_group_cache_lock, + lock=lambda self: self._security_group_cache.lock, ) def _create_security_group( self, @@ -923,9 +931,9 @@ def _get_image_id_and_username_cache_key( ) @cachedmethod( - cache=lambda self: self._get_image_id_and_username_cache, + cache=lambda self: self._image_id_and_username_cache.cache, key=_get_image_id_and_username_cache_key, - lock=lambda self: self._get_image_id_and_username_cache_lock, + lock=lambda self: self._image_id_and_username_cache.lock, ) def _get_image_id_and_username( self, diff --git a/src/dstack/_internal/core/backends/base/compute.py b/src/dstack/_internal/core/backends/base/compute.py index 75a68e77ff..49513e3211 100644 --- 
a/src/dstack/_internal/core/backends/base/compute.py +++ b/src/dstack/_internal/core/backends/base/compute.py @@ -6,6 +6,7 @@ import threading from abc import ABC, abstractmethod from collections.abc import Iterable, Iterator +from dataclasses import dataclass, field from enum import Enum from functools import lru_cache from pathlib import Path @@ -14,7 +15,7 @@ import git import requests import yaml -from cachetools import TTLCache, cachedmethod +from cachetools import Cache, TTLCache, cachedmethod from gpuhunt import CPUArchitecture from dstack._internal import settings @@ -89,6 +90,18 @@ def to_cpu_architecture(self) -> CPUArchitecture: assert False, self +@dataclass +class ComputeCache: + cache: Cache + lock: threading.Lock = field(default_factory=threading.Lock) + + +@dataclass +class ComputeTTLCache: + cache: TTLCache + lock: threading.Lock = field(default_factory=threading.Lock) + + class Compute(ABC): """ A base class for all compute implementations with minimal features. diff --git a/src/dstack/_internal/core/backends/gcp/compute.py b/src/dstack/_internal/core/backends/gcp/compute.py index c2c18e3d9f..cd5ecb829f 100644 --- a/src/dstack/_internal/core/backends/gcp/compute.py +++ b/src/dstack/_internal/core/backends/gcp/compute.py @@ -1,7 +1,6 @@ import concurrent.futures import json import re -import threading from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass @@ -19,6 +18,7 @@ from dstack import version from dstack._internal.core.backends.base.compute import ( Compute, + ComputeTTLCache, ComputeWithAllOffersCached, ComputeWithCreateInstanceSupport, ComputeWithGatewaySupport, @@ -127,11 +127,9 @@ def __init__(self, config: GCPConfig): credentials=self.credentials ) self.reservations_client = compute_v1.ReservationsClient(credentials=self.credentials) - self._usable_subnets_cache_lock = threading.Lock() - self._usable_subnets_cache = TTLCache(maxsize=1, ttl=120) - self._find_reservation_cache_lock = 
threading.Lock() - # smaller TTL, since we check the reservation's in_use_count, which can change often - self._find_reservation_cache = TTLCache(maxsize=8, ttl=20) + self._usable_subnets_cache = ComputeTTLCache(cache=TTLCache(maxsize=1, ttl=120)) + # Smaller TTL since we check the reservation's in_use_count, which can change often + self._reservation_cache = ComputeTTLCache(cache=TTLCache(maxsize=8, ttl=20)) def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: regions = get_or_error(self.config.regions) @@ -948,8 +946,8 @@ def _get_roce_subnets( return nic_subnets @cachedmethod( - cache=lambda self: self._usable_subnets_cache, - lock=lambda self: self._usable_subnets_cache_lock, + cache=lambda self: self._usable_subnets_cache.cache, + lock=lambda self: self._usable_subnets_cache.lock, ) def _list_usable_subnets(self) -> list[compute_v1.UsableSubnetwork]: # To avoid hitting the `ListUsable requests per minute` system limit, we fetch all subnets @@ -969,8 +967,8 @@ def _get_vpc_subnet(self, region: str) -> Optional[str]: ) @cachedmethod( - cache=lambda self: self._find_reservation_cache, - lock=lambda self: self._find_reservation_cache_lock, + cache=lambda self: self._reservation_cache.cache, + lock=lambda self: self._reservation_cache.lock, ) def _find_reservation(self, configured_name: str) -> dict[str, compute_v1.Reservation]: if match := RESERVATION_PATTERN.fullmatch(configured_name): diff --git a/src/dstack/_internal/server/services/backends/__init__.py b/src/dstack/_internal/server/services/backends/__init__.py index 53284e6175..ce0f17bde5 100644 --- a/src/dstack/_internal/server/services/backends/__init__.py +++ b/src/dstack/_internal/server/services/backends/__init__.py @@ -1,5 +1,6 @@ import asyncio import heapq +import time from collections.abc import Iterable, Iterator from typing import Callable, Coroutine, Dict, List, Optional, Tuple from uuid import UUID @@ -361,7 +362,7 @@ def get_filtered_offers_with_backends( yield 
(backend, offer) logger.info("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends]) - tasks = [run_async(backend.compute().get_offers, requirements) for backend in backends] + tasks = [run_async(get_offers_tracked, backend, requirements) for backend in backends] offers_by_backend = [] for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)): if isinstance(result, BackendError): @@ -391,3 +392,13 @@ def check_backend_type_available(backend_type: BackendType): " Ensure that backend dependencies are installed." f" Available backends: {[b.value for b in list_available_backend_types()]}." ) + + +def get_offers_tracked( + backend: Backend, requirements: Requirements +) -> Iterator[InstanceOfferWithAvailability]: + start = time.time() + res = backend.compute().get_offers(requirements) + duration = time.time() - start + logger.debug("Got offers from %s in %.6fs", backend.TYPE.value, duration) + return res From 32fbc02816e2ea5f10302bbf372c1142d952de0a Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 21 Jan 2026 10:07:53 +0100 Subject: [PATCH 065/187] [UI] Minor re-order in the sidebar (#3484) --- frontend/src/layouts/AppLayout/hooks.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/layouts/AppLayout/hooks.ts b/frontend/src/layouts/AppLayout/hooks.ts index a305317d50..f46366fcd6 100644 --- a/frontend/src/layouts/AppLayout/hooks.ts +++ b/frontend/src/layouts/AppLayout/hooks.ts @@ -25,11 +25,11 @@ export const useSideNavigation = () => { const generalLinks = [ { type: 'link', text: t('navigation.runs'), href: ROUTES.RUNS.LIST }, { type: 'link', text: t('navigation.offers'), href: ROUTES.OFFERS.LIST }, - { type: 'link', text: t('navigation.models'), href: ROUTES.MODELS.LIST }, { type: 'link', text: t('navigation.fleets'), href: ROUTES.FLEETS.LIST }, { type: 'link', text: t('navigation.instances'), href: ROUTES.INSTANCES.LIST }, 
{ type: 'link', text: t('navigation.volumes'), href: ROUTES.VOLUMES.LIST }, { type: 'link', text: t('navigation.events'), href: ROUTES.EVENTS.LIST }, + { type: 'link', text: t('navigation.models'), href: ROUTES.MODELS.LIST }, { type: 'link', text: t('navigation.project_other'), href: ROUTES.PROJECT.LIST }, isGlobalAdmin && { From 6d14aadcb1ee7309f7b7f97d2c2239a5f2766470 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 21 Jan 2026 16:14:37 +0500 Subject: [PATCH 066/187] Add missing Box imports (#3485) --- frontend/src/pages/User/Details/index.tsx | 2 +- frontend/src/pages/User/List/index.tsx | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend/src/pages/User/Details/index.tsx b/frontend/src/pages/User/Details/index.tsx index 1ee131094c..8f1b2d393d 100644 --- a/frontend/src/pages/User/Details/index.tsx +++ b/frontend/src/pages/User/Details/index.tsx @@ -2,7 +2,7 @@ import React, { useEffect, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { Outlet, useNavigate, useParams } from 'react-router-dom'; -import { ConfirmationDialog, ContentLayout, SpaceBetween, Tabs } from 'components'; +import { Box, ConfirmationDialog, ContentLayout, SpaceBetween, Tabs } from 'components'; import { DetailsHeader } from 'components'; import { useNotifications /* usePermissionGuard*/ } from 'hooks'; diff --git a/frontend/src/pages/User/List/index.tsx b/frontend/src/pages/User/List/index.tsx index 5831d03383..61d14e83fc 100644 --- a/frontend/src/pages/User/List/index.tsx +++ b/frontend/src/pages/User/List/index.tsx @@ -4,6 +4,7 @@ import { useNavigate } from 'react-router-dom'; import { format } from 'date-fns'; import { + Box, Button, ConfirmationDialog, Header, From f09d06180cca3ac93933840f7153161e60444471 Mon Sep 17 00:00:00 2001 From: Oleg Date: Wed, 21 Jan 2026 15:00:48 +0300 Subject: [PATCH 067/187] Hotfix. 
Fixed generation fleet fields in project forms (#3486) --- frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx | 4 ++-- frontend/src/pages/Project/Add/index.tsx | 2 +- frontend/src/pages/Project/CreateWizard/index.tsx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx b/frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx index 96a12ef4c8..a19e701366 100644 --- a/frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx +++ b/frontend/src/pages/Fleets/Add/FleetFormFields/index.tsx @@ -18,12 +18,12 @@ export function FleetFormFields({ const { t } = useTranslation(); const [openHelpPanel] = useHelpPanel(); - const getFieldNameWitPrefix = (name: string) => { + const getFieldNameWitPrefix = (name: string): string => { if (!fieldNamePrefix) { return name; } - [fieldNamePrefix, name].join('.'); + return [fieldNamePrefix, name].join('.'); }; return ( diff --git a/frontend/src/pages/Project/Add/index.tsx b/frontend/src/pages/Project/Add/index.tsx index 14bbcc71fe..23a9eb0a19 100644 --- a/frontend/src/pages/Project/Add/index.tsx +++ b/frontend/src/pages/Project/Add/index.tsx @@ -302,7 +302,7 @@ export const ProjectAdd: React.FC = () => { control={control} disabledAllFields={loading} - fieldNamePrefix="fleet." + fieldNamePrefix="fleet" /> )} diff --git a/frontend/src/pages/Project/CreateWizard/index.tsx b/frontend/src/pages/Project/CreateWizard/index.tsx index 4981efb6f1..83cf3804d3 100644 --- a/frontend/src/pages/Project/CreateWizard/index.tsx +++ b/frontend/src/pages/Project/CreateWizard/index.tsx @@ -442,7 +442,7 @@ export const CreateProjectWizard: React.FC = () => { control={control} disabledAllFields={loading} - fieldNamePrefix="fleet." 
+ fieldNamePrefix="fleet" /> )} From 2ad526cdcd0588f1657b0c3481295252c5a67ea1 Mon Sep 17 00:00:00 2001 From: Oleg Vavilov Date: Wed, 21 Jan 2026 20:33:03 +0300 Subject: [PATCH 068/187] Small fix --- frontend/src/locale/en.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index 5cd0808be4..c821abf317 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -586,6 +586,9 @@ "terminating": "Terminating", "terminated": "Terminated" }, + "create": { + "success_notification": "The fleet is created!" + }, "instances": { "active_only": "Active instances", "filter_property_placeholder": "Filter by properties", From d3beebc926cd68ff2f9364f35c52f66aa6665b5b Mon Sep 17 00:00:00 2001 From: Oleg Vavilov Date: Wed, 21 Jan 2026 22:11:21 +0300 Subject: [PATCH 069/187] Added react rules for eslint --- frontend/eslint.config.cjs | 8 +- frontend/package-lock.json | 2181 ++++++++++++++++++++---- frontend/package.json | 3 +- frontend/src/components/Code/index.tsx | 2 + 4 files changed, 1858 insertions(+), 336 deletions(-) diff --git a/frontend/eslint.config.cjs b/frontend/eslint.config.cjs index 3cfe1c41e4..9750398241 100644 --- a/frontend/eslint.config.cjs +++ b/frontend/eslint.config.cjs @@ -1,6 +1,7 @@ const { defineConfig, globalIgnores } = require('eslint/config'); const i18N = require('eslint-plugin-i18n'); const simpleImportSort = require('eslint-plugin-simple-import-sort'); +const react = require('eslint-plugin-react'); const { FlatCompat } = require('@eslint/eslintrc'); const js = require('@eslint/js'); const typescriptEslint = require('@typescript-eslint/eslint-plugin'); @@ -18,20 +19,25 @@ const BASE_CONFIG = { 'plugin:@typescript-eslint/eslint-recommended', 'plugin:@typescript-eslint/recommended', 'prettier', - 'plugin:prettier/recommended' + 'plugin:prettier/recommended', + 'plugin:react/recommended', ), plugins: { '@typescript-eslint': typescriptEslint, i18n: i18N, 'simple-import-sort': 
simpleImportSort, + react: react, }, languageOptions: { parser: tsParser, }, + settings: {}, rules: { + 'react/jsx-no-target-blank': 'off', + 'react/no-unescaped-entities': 'off', 'i18n/no-russian-character': 1, 'simple-import-sort/imports': [ diff --git a/frontend/package-lock.json b/frontend/package-lock.json index e66eba969c..a3d3f29d93 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -81,10 +81,13 @@ "cross-env": "^7.0.3", "css-loader": "^6.7.3", "enzyme": "^3.11.0", - "eslint": "^9.32.0", + "eslint": "^9.39.2", "eslint-config-prettier": "^10.1.5", "eslint-plugin-i18n": "^2.4.0", + "eslint-plugin-import": "^2.32.0", + "eslint-plugin-import-x": "^4.16.1", "eslint-plugin-prettier": "^5.4.1", + "eslint-plugin-react": "^7.37.5", "eslint-plugin-simple-import-sort": "^12.1.1", "favicons": "^7.2.0", "favicons-webpack-plugin": "^6.0.1", @@ -2548,11 +2551,35 @@ "react": ">=16.8.0" } }, + "node_modules/@emnapi/core": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.8.1.tgz", + "integrity": "sha512-AvT9QFpxK0Zd8J0jopedNm+w/2fIzvtPKPjqyw9jwvBaReTTqPBk9Hixaz7KbjimP+QNz605/XnjFcDAL2pqBg==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/wasi-threads": "1.1.0", + "tslib": "^2.4.0" + } + }, "node_modules/@emnapi/runtime": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.3.1.tgz", - "integrity": "sha512-kEBmG8KyqtxJZv+ygbEim+KCGtIq1fC22Ms3S4ziXmYKm8uyoLX0MHONVKwp+9opg390VaKRNt4a7A9NwmpNhw==", + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.8.1.tgz", + "integrity": "sha512-mehfKSMWjjNol8659Z8KxEMrdSJDDot5SXMq00dM8BN4o+CLNXQ0xH2V7EchNHV4RmbZLmmPdEaXZc5H2FXmDg==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, + "node_modules/@emnapi/wasi-threads": { + "version": "1.1.0", + "resolved": 
"https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.1.0.tgz", + "integrity": "sha512-WI0DdZ8xFSbgMjR1sFsKABJ/C5OnRrjT06JXbZKexJGrDuPTzZdDYfFlsgcCXCyf+suG5QU2e/y1Wo2V/OapLQ==", "dev": true, + "license": "MIT", "optional": true, "dependencies": { "tslib": "^2.4.0" @@ -2583,9 +2610,9 @@ "peer": true }, "node_modules/@eslint-community/eslint-utils": { - "version": "4.7.0", - "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.7.0.tgz", - "integrity": "sha512-dyybb3AcajC7uha6CvhdVRJqaKyn7w2YKqKyAN37NKYgZT36w+iRb0Dymmc5qEJ549c/S31cMMSFd75bteCpCw==", + "version": "4.9.1", + "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.9.1.tgz", + "integrity": "sha512-phrYmNiYppR7znFEdqgfWHXR6NCkZEK7hwWDHZUjit/2/U0r6XvkDl0SYnoM51Hq7FhCGdLDT6zxCCOY1hexsQ==", "dev": true, "license": "MIT", "dependencies": { @@ -2611,13 +2638,13 @@ } }, "node_modules/@eslint/config-array": { - "version": "0.21.0", - "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.21.0.tgz", - "integrity": "sha512-ENIdc4iLu0d93HeYirvKmrzshzofPw6VkZRKQGe9Nv46ZnWUzcF1xV01dcvEg/1wXUR61OmmlSfyeyO7EvjLxQ==", + "version": "0.21.1", + "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.21.1.tgz", + "integrity": "sha512-aw1gNayWpdI/jSYVgzN5pL0cfzU02GT3NBpeT/DXbx1/1x7ZKxFPd9bwrzygx/qiwIQiJ1sw/zD8qY/kRvlGHA==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@eslint/object-schema": "^2.1.6", + "@eslint/object-schema": "^2.1.7", "debug": "^4.3.1", "minimatch": "^3.1.2" }, @@ -2626,19 +2653,22 @@ } }, "node_modules/@eslint/config-helpers": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.3.0.tgz", - "integrity": "sha512-ViuymvFmcJi04qdZeDc2whTHryouGcDlaxPqarTD0ZE10ISpxGUVZGZDx4w01upyIynL3iu6IXH2bS1NhclQMw==", + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.4.2.tgz", 
+ "integrity": "sha512-gBrxN88gOIf3R7ja5K9slwNayVcZgK6SOUORm2uBzTeIEfeVaIhOpCtTox3P6R7o2jLFwLFTLnC7kU/RGcYEgw==", "dev": true, "license": "Apache-2.0", + "dependencies": { + "@eslint/core": "^0.17.0" + }, "engines": { "node": "^18.18.0 || ^20.9.0 || >=21.1.0" } }, "node_modules/@eslint/core": { - "version": "0.15.1", - "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.15.1.tgz", - "integrity": "sha512-bkOp+iumZCCbt1K1CmWf0R9pM5yKpDv+ZXtvSyQpudrI9kuFLp+bM2WOPXImuD/ceQuaa8f5pj93Y7zyECIGNA==", + "version": "0.17.0", + "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.17.0.tgz", + "integrity": "sha512-yL/sLrpmtDaFEiUj1osRP4TI2MDz1AddJL+jZ7KSqvBuliN4xqYY54IfdN8qD8Toa6g1iloph1fxQNkjOxrrpQ==", "dev": true, "license": "Apache-2.0", "dependencies": { @@ -2710,9 +2740,9 @@ "license": "MIT" }, "node_modules/@eslint/js": { - "version": "9.32.0", - "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.32.0.tgz", - "integrity": "sha512-BBpRFZK3eX6uMLKz8WxFOBIFFcGFJ/g8XuwjTHCqHROSIsopI+ddn/d5Cfh36+7+e5edVS8dbSHnBNhrLEX0zg==", + "version": "9.39.2", + "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.39.2.tgz", + "integrity": "sha512-q1mjIoW1VX4IvSocvM/vbTiveKC4k9eLrajNEuSsmjymSDEbpGddtpfOoN7YGAqBK3NG+uqo8ia4PDTt8buCYA==", "dev": true, "license": "MIT", "engines": { @@ -2723,9 +2753,9 @@ } }, "node_modules/@eslint/object-schema": { - "version": "2.1.6", - "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.6.tgz", - "integrity": "sha512-RBMg5FRL0I0gs51M/guSAj5/e14VQ4tpZnQNWwuDT66P14I43ItmPfIZRhO9fUVIPOAQXU47atlywZ/czoqFPA==", + "version": "2.1.7", + "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.7.tgz", + "integrity": "sha512-VtAOaymWVfZcmZbp6E2mympDIHvyjXs/12LqWYjVw6qjrfF+VK+fyG33kChz3nnK+SU5/NeHOqrTEHS8sXO3OA==", "dev": true, "license": "Apache-2.0", "engines": { @@ -2733,13 +2763,13 @@ } }, "node_modules/@eslint/plugin-kit": { - "version": "0.3.4", - "resolved": 
"https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.3.4.tgz", - "integrity": "sha512-Ul5l+lHEcw3L5+k8POx6r74mxEYKG5kOb6Xpy2gCRW6zweT6TEhAf8vhxGgjhqrd/VO/Dirhsb+1hNpD1ue9hw==", + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.4.1.tgz", + "integrity": "sha512-43/qtrDUokr7LJqoF2c3+RInu/t4zfrpYdoSDfYyhg52rwLV6TnOvdG4fXm7IkSB3wErkcmJS9iEhjVtOSEjjA==", "dev": true, "license": "Apache-2.0", "dependencies": { - "@eslint/core": "^0.15.1", + "@eslint/core": "^0.17.0", "levn": "^0.4.1" }, "engines": { @@ -3231,6 +3261,29 @@ "url": "https://opencollective.com/libvips" } }, + "node_modules/@isaacs/balanced-match": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/@isaacs/balanced-match/-/balanced-match-4.0.1.tgz", + "integrity": "sha512-yzMTt9lEb8Gv7zRioUilSglI0c0smZ9k5D65677DLWLtWJaXIS3CqcGyUFByYKlnUj6TkjLVs54fBl6+TiGQDQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": "20 || >=22" + } + }, + "node_modules/@isaacs/brace-expansion": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/@isaacs/brace-expansion/-/brace-expansion-5.0.0.tgz", + "integrity": "sha512-ZT55BDLV0yv0RBm2czMiZ+SqCGO7AvmOM3G/w2xhVPH+te0aKgFjmBvGlL1dH+ql2tgGO3MVrbb3jCKyvpgnxA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@isaacs/balanced-match": "^4.0.1" + }, + "engines": { + "node": "20 || >=22" + } + }, "node_modules/@istanbuljs/load-nyc-config": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz", @@ -3820,6 +3873,19 @@ "integrity": "sha512-Vo+PSpZG2/fmgmiNzYK9qWRh8h/CHrwD0mo1h1DzL4yzHNSfWYujGTYsWGreD000gcgmZ7K4Ys6Tx9TxtsKdDw==", "dev": true }, + "node_modules/@napi-rs/wasm-runtime": { + "version": "0.2.12", + "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", + "integrity": "sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==", + "dev": 
true, + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/core": "^1.4.3", + "@emnapi/runtime": "^1.4.3", + "@tybys/wasm-util": "^0.10.0" + } + }, "node_modules/@nicolo-ribaudo/chokidar-2": { "version": "2.1.8-no-fsevents.3", "resolved": "https://registry.npmjs.org/@nicolo-ribaudo/chokidar-2/-/chokidar-2-2.1.8-no-fsevents.3.tgz", @@ -4296,6 +4362,13 @@ "node": ">=10" } }, + "node_modules/@rtsao/scc": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@rtsao/scc/-/scc-1.1.0.tgz", + "integrity": "sha512-zt6OdqaDoOnJ1ZYsCYGt9YmWzDXl4vQdKTyJev62gFhRGKdx7mcT54V9KIjg+d2wi9EXsPvAPKe7i7WjfVWB8g==", + "dev": true, + "license": "MIT" + }, "node_modules/@sinclair/typebox": { "version": "0.27.8", "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", @@ -4707,6 +4780,17 @@ "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==", "dev": true }, + "node_modules/@tybys/wasm-util": { + "version": "0.10.1", + "resolved": "https://registry.npmjs.org/@tybys/wasm-util/-/wasm-util-0.10.1.tgz", + "integrity": "sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, "node_modules/@types/aria-query": { "version": "5.0.4", "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", @@ -5016,6 +5100,13 @@ "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==" }, + "node_modules/@types/json5": { + "version": "0.0.29", + "resolved": "https://registry.npmjs.org/@types/json5/-/json5-0.0.29.tgz", + "integrity": "sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/lodash": { 
"version": "4.17.13", "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.13.tgz", @@ -5534,6 +5625,275 @@ "url": "https://opencollective.com/eslint" } }, + "node_modules/@unrs/resolver-binding-android-arm-eabi": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-android-arm-eabi/-/resolver-binding-android-arm-eabi-1.11.1.tgz", + "integrity": "sha512-ppLRUgHVaGRWUx0R0Ut06Mjo9gBaBkg3v/8AxusGLhsIotbBLuRk51rAzqLC8gq6NyyAojEXglNjzf6R948DNw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@unrs/resolver-binding-android-arm64": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-android-arm64/-/resolver-binding-android-arm64-1.11.1.tgz", + "integrity": "sha512-lCxkVtb4wp1v+EoN+HjIG9cIIzPkX5OtM03pQYkG+U5O/wL53LC4QbIeazgiKqluGeVEeBlZahHalCaBvU1a2g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@unrs/resolver-binding-darwin-arm64": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-darwin-arm64/-/resolver-binding-darwin-arm64-1.11.1.tgz", + "integrity": "sha512-gPVA1UjRu1Y/IsB/dQEsp2V1pm44Of6+LWvbLc9SDk1c2KhhDRDBUkQCYVWe6f26uJb3fOK8saWMgtX8IrMk3g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@unrs/resolver-binding-darwin-x64": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-darwin-x64/-/resolver-binding-darwin-x64-1.11.1.tgz", + "integrity": "sha512-cFzP7rWKd3lZaCsDze07QX1SC24lO8mPty9vdP+YVa3MGdVgPmFc59317b2ioXtgCMKGiCLxJ4HQs62oz6GfRQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@unrs/resolver-binding-freebsd-x64": { + "version": "1.11.1", + "resolved": 
"https://registry.npmjs.org/@unrs/resolver-binding-freebsd-x64/-/resolver-binding-freebsd-x64-1.11.1.tgz", + "integrity": "sha512-fqtGgak3zX4DCB6PFpsH5+Kmt/8CIi4Bry4rb1ho6Av2QHTREM+47y282Uqiu3ZRF5IQioJQ5qWRV6jduA+iGw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@unrs/resolver-binding-linux-arm-gnueabihf": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-arm-gnueabihf/-/resolver-binding-linux-arm-gnueabihf-1.11.1.tgz", + "integrity": "sha512-u92mvlcYtp9MRKmP+ZvMmtPN34+/3lMHlyMj7wXJDeXxuM0Vgzz0+PPJNsro1m3IZPYChIkn944wW8TYgGKFHw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-arm-musleabihf": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-arm-musleabihf/-/resolver-binding-linux-arm-musleabihf-1.11.1.tgz", + "integrity": "sha512-cINaoY2z7LVCrfHkIcmvj7osTOtm6VVT16b5oQdS4beibX2SYBwgYLmqhBjA1t51CarSaBuX5YNsWLjsqfW5Cw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-arm64-gnu": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-arm64-gnu/-/resolver-binding-linux-arm64-gnu-1.11.1.tgz", + "integrity": "sha512-34gw7PjDGB9JgePJEmhEqBhWvCiiWCuXsL9hYphDF7crW7UgI05gyBAi6MF58uGcMOiOqSJ2ybEeCvHcq0BCmQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-arm64-musl": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-arm64-musl/-/resolver-binding-linux-arm64-musl-1.11.1.tgz", + "integrity": "sha512-RyMIx6Uf53hhOtJDIamSbTskA99sPHS96wxVE/bJtePJJtpdKGXO1wY90oRdXuYOGOTuqjT8ACccMc4K6QmT3w==", + "cpu": [ + 
"arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-ppc64-gnu": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-ppc64-gnu/-/resolver-binding-linux-ppc64-gnu-1.11.1.tgz", + "integrity": "sha512-D8Vae74A4/a+mZH0FbOkFJL9DSK2R6TFPC9M+jCWYia/q2einCubX10pecpDiTmkJVUH+y8K3BZClycD8nCShA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-riscv64-gnu": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-riscv64-gnu/-/resolver-binding-linux-riscv64-gnu-1.11.1.tgz", + "integrity": "sha512-frxL4OrzOWVVsOc96+V3aqTIQl1O2TjgExV4EKgRY09AJ9leZpEg8Ak9phadbuX0BA4k8U5qtvMSQQGGmaJqcQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-riscv64-musl": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-riscv64-musl/-/resolver-binding-linux-riscv64-musl-1.11.1.tgz", + "integrity": "sha512-mJ5vuDaIZ+l/acv01sHoXfpnyrNKOk/3aDoEdLO/Xtn9HuZlDD6jKxHlkN8ZhWyLJsRBxfv9GYM2utQ1SChKew==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-s390x-gnu": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-s390x-gnu/-/resolver-binding-linux-s390x-gnu-1.11.1.tgz", + "integrity": "sha512-kELo8ebBVtb9sA7rMe1Cph4QHreByhaZ2QEADd9NzIQsYNQpt9UkM9iqr2lhGr5afh885d/cB5QeTXSbZHTYPg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-x64-gnu": { + "version": "1.11.1", + "resolved": 
"https://registry.npmjs.org/@unrs/resolver-binding-linux-x64-gnu/-/resolver-binding-linux-x64-gnu-1.11.1.tgz", + "integrity": "sha512-C3ZAHugKgovV5YvAMsxhq0gtXuwESUKc5MhEtjBpLoHPLYM+iuwSj3lflFwK3DPm68660rZ7G8BMcwSro7hD5w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-linux-x64-musl": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-x64-musl/-/resolver-binding-linux-x64-musl-1.11.1.tgz", + "integrity": "sha512-rV0YSoyhK2nZ4vEswT/QwqzqQXw5I6CjoaYMOX0TqBlWhojUf8P94mvI7nuJTeaCkkds3QE4+zS8Ko+GdXuZtA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@unrs/resolver-binding-wasm32-wasi": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-wasm32-wasi/-/resolver-binding-wasm32-wasi-1.11.1.tgz", + "integrity": "sha512-5u4RkfxJm+Ng7IWgkzi3qrFOvLvQYnPBmjmZQ8+szTK/b31fQCnleNl1GgEt7nIsZRIf5PLhPwT0WM+q45x/UQ==", + "cpu": [ + "wasm32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@napi-rs/wasm-runtime": "^0.2.11" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/@unrs/resolver-binding-win32-arm64-msvc": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-arm64-msvc/-/resolver-binding-win32-arm64-msvc-1.11.1.tgz", + "integrity": "sha512-nRcz5Il4ln0kMhfL8S3hLkxI85BXs3o8EYoattsJNdsX4YUU89iOkVn7g0VHSRxFuVMdM4Q1jEpIId1Ihim/Uw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@unrs/resolver-binding-win32-ia32-msvc": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-ia32-msvc/-/resolver-binding-win32-ia32-msvc-1.11.1.tgz", + "integrity": 
"sha512-DCEI6t5i1NmAZp6pFonpD5m7i6aFrpofcp4LA2i8IIq60Jyo28hamKBxNrZcyOwVOZkgsRp9O2sXWBWP8MnvIQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@unrs/resolver-binding-win32-x64-msvc": { + "version": "1.11.1", + "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-x64-msvc/-/resolver-binding-win32-x64-msvc-1.11.1.tgz", + "integrity": "sha512-lrW200hZdbfRtztbygyaq/6jP6AKE8qQN2KvPcJ+x7wiD038YtnYtZ82IMNJ69GJibV7bwL3y9FgK+5w/pYt6g==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@webassemblyjs/ast": { "version": "1.12.1", "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.12.1.tgz", @@ -5982,13 +6342,14 @@ } }, "node_modules/array-buffer-byte-length": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.1.tgz", - "integrity": "sha512-ahC5W1xgou+KTXix4sAO8Ki12Q+jf4i0+tmk3sC+zgcynshkHxzpXdImBehiUYKKKDwvfFiJl1tZt6ewscS1Mg==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.2.tgz", + "integrity": "sha512-LHE+8BuR7RYGDKvnrmcuSq3tDcKv9OFEXQt/HpbZhY7V6h0zlUXutnAD82GiFx9rdieCMjkvtcsPqBwgUl1Iiw==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.5", - "is-array-buffer": "^3.0.4" + "call-bound": "^1.0.3", + "is-array-buffer": "^3.0.5" }, "engines": { "node": ">= 0.4" @@ -6003,8 +6364,31 @@ "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==", "dev": true }, - "node_modules/array-union": { - "version": "2.1.0", + "node_modules/array-includes": { + "version": "3.1.9", + "resolved": "https://registry.npmjs.org/array-includes/-/array-includes-3.1.9.tgz", + "integrity": "sha512-FmeCCAenzH0KH381SPT5FZmiA/TmpndpcaShhfgEN9eCVjnFBqq3l1xrI42y8+PPLI6hypzou4GXw00WHmPBLQ==", + "dev": 
true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "define-properties": "^1.2.1", + "es-abstract": "^1.24.0", + "es-object-atoms": "^1.1.1", + "get-intrinsic": "^1.3.0", + "is-string": "^1.1.1", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array-union": { + "version": "2.1.0", "resolved": "https://registry.npmjs.org/array-union/-/array-union-2.1.0.tgz", "integrity": "sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==", "dev": true, @@ -6032,16 +6416,79 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/array.prototype.findlast": { + "version": "1.2.5", + "resolved": "https://registry.npmjs.org/array.prototype.findlast/-/array.prototype.findlast-1.2.5.tgz", + "integrity": "sha512-CVvd6FHg1Z3POpBLxO6E6zr+rSKEQ9L6rZHAaY7lLfhKsWYUBBOuMs0e9o24oopj6H+geRCX0YJ+TJLBK2eHyQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.2", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "es-shim-unscopables": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array.prototype.findlastindex": { + "version": "1.2.6", + "resolved": "https://registry.npmjs.org/array.prototype.findlastindex/-/array.prototype.findlastindex-1.2.6.tgz", + "integrity": "sha512-F/TKATkzseUExPlfvmwQKGITM3DGTK+vkAsCZoDc5daVygbJBnjEUCbgkAvVFsgfXfX4YIqZ/27G3k3tdXrTxQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.9", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "es-shim-unscopables": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": 
"https://github.com/sponsors/ljharb" + } + }, "node_modules/array.prototype.flat": { - "version": "1.3.2", - "resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.3.2.tgz", - "integrity": "sha512-djYB+Zx2vLewY8RWlNCUdHjDXs2XOgm602S9E7P/UpHgfeHL00cRiIF+IN/G/aUJ7kGPb6yO/ErDI5V2s8iycA==", + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.3.3.tgz", + "integrity": "sha512-rwG/ja1neyLqCuGZ5YYrznA62D4mZXg0i1cIskIUKSiqF3Cje9/wXAls9B9s1Wa2fomMsIv8czB8jZcPmxCXFg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.5", + "es-shim-unscopables": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/array.prototype.flatmap": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/array.prototype.flatmap/-/array.prototype.flatmap-1.3.3.tgz", + "integrity": "sha512-Y7Wt51eKJSyi80hFrJCePGGNo5ktJCslFuboqJsbf57CCPcm5zztluPlc4/aD8sWsKvlwatezpV4U1efk8kpjg==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.2", - "define-properties": "^1.2.0", - "es-abstract": "^1.22.1", - "es-shim-unscopables": "^1.0.0" + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.5", + "es-shim-unscopables": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -6050,20 +6497,37 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/array.prototype.tosorted": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/array.prototype.tosorted/-/array.prototype.tosorted-1.1.4.tgz", + "integrity": "sha512-p6Fx8B7b7ZhL/gmUsAy0D15WhvDccw3mnGNbZpi3pmeJdxtWsj2jEaI4Y6oo3XiHfzuSgPwKc04MYt6KgvC/wA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.3", + "es-errors": "^1.3.0", + 
"es-shim-unscopables": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/arraybuffer.prototype.slice": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/arraybuffer.prototype.slice/-/arraybuffer.prototype.slice-1.0.3.tgz", - "integrity": "sha512-bMxMKAjg13EBSVscxTaYA4mRc5t1UAXa2kXiGTNfZ079HIWXEkKmkgFrh/nJqamaLSrXO5H4WFFkPEaLJWbs3A==", + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/arraybuffer.prototype.slice/-/arraybuffer.prototype.slice-1.0.4.tgz", + "integrity": "sha512-BNoCY6SXXPQ7gF2opIP4GBE+Xw7U+pHMYKuzjgCN3GwiaIR09UUeKfheyIry77QtrCBlC0KK0q5/TER/tYh3PQ==", "dev": true, + "license": "MIT", "dependencies": { "array-buffer-byte-length": "^1.0.1", - "call-bind": "^1.0.5", + "call-bind": "^1.0.8", "define-properties": "^1.2.1", - "es-abstract": "^1.22.3", - "es-errors": "^1.2.1", - "get-intrinsic": "^1.2.3", - "is-array-buffer": "^3.0.4", - "is-shared-array-buffer": "^1.0.2" + "es-abstract": "^1.23.5", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "is-array-buffer": "^3.0.4" }, "engines": { "node": ">= 0.4" @@ -6078,6 +6542,16 @@ "integrity": "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==", "dev": true }, + "node_modules/async-function": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/async-function/-/async-function-1.0.0.tgz", + "integrity": "sha512-hsU18Ae8CDTR6Kgu9DYf0EbCr/a5iGL0rytQDobUcdpYOKokk8LEjVphnXkDkgpi0wYVsqrXuP0bZxJaTqdgoA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/asynckit": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", @@ -6143,6 +6617,7 @@ "resolved": "https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.7.tgz", "integrity": "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==", "dev": true, + "license": "MIT", "dependencies": { 
"possible-typed-array-names": "^1.0.0" }, @@ -6902,16 +7377,47 @@ } }, "node_modules/call-bind": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", - "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.8.tgz", + "integrity": "sha512-oKlSFMcMwpUg2ednkhQ454wfWiU/ul3CkJe/PEHcTKuiX6RpbehUiFMXu13HalGZxfUwCQzZG747YXBn1im9ww==", "dev": true, + "license": "MIT", "dependencies": { + "call-bind-apply-helpers": "^1.0.0", "es-define-property": "^1.0.0", - "es-errors": "^1.3.0", - "function-bind": "^1.1.2", "get-intrinsic": "^1.2.4", - "set-function-length": "^1.2.1" + "set-function-length": "^1.2.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/call-bound": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz", + "integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "get-intrinsic": "^1.3.0" }, "engines": { "node": ">= 0.4" @@ -7370,6 +7876,16 @@ "node": ">= 6" } }, + "node_modules/comment-parser": { + "version": "1.4.4", + "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.4.4.tgz", + "integrity": 
"sha512-0D6qSQ5IkeRrGJFHRClzaMOenMeT0gErz3zIw3AprKMqhRN6LNU2jQOdkPG/FZ+8bCgXE1VidrgSzlBBDZRr8A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 12.0.0" + } + }, "node_modules/common-path-prefix": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/common-path-prefix/-/common-path-prefix-3.0.0.tgz", @@ -8043,14 +8559,15 @@ } }, "node_modules/data-view-buffer": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.1.tgz", - "integrity": "sha512-0lht7OugA5x3iJLOWFhWK/5ehONdprk0ISXqVFn/NFrDu+cuc8iADFrGQz5BnRK7LLU3JmkbXSxaqX+/mXYtUA==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.2.tgz", + "integrity": "sha512-EmKO5V3OLXh1rtK2wgXRansaK1/mtVdTUEiEI0W8RkvgT05kfxaH29PliLnpLP73yYO6142Q72QNa8Wx/A5CqQ==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.6", + "call-bound": "^1.0.3", "es-errors": "^1.3.0", - "is-data-view": "^1.0.1" + "is-data-view": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -8060,29 +8577,31 @@ } }, "node_modules/data-view-byte-length": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/data-view-byte-length/-/data-view-byte-length-1.0.1.tgz", - "integrity": "sha512-4J7wRJD3ABAzr8wP+OcIcqq2dlUKp4DVflx++hs5h5ZKydWMI6/D/fAot+yh6g2tHh8fLFTvNOaVN357NvSrOQ==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/data-view-byte-length/-/data-view-byte-length-1.0.2.tgz", + "integrity": "sha512-tuhGbE6CfTM9+5ANGf+oQb72Ky/0+s3xKUpHvShfiz2RxMFgFPjsXuRLBVMtvMs15awe45SRb83D6wH4ew6wlQ==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.7", + "call-bound": "^1.0.3", "es-errors": "^1.3.0", - "is-data-view": "^1.0.1" + "is-data-view": "^1.0.2" }, "engines": { "node": ">= 0.4" }, "funding": { - "url": "https://github.com/sponsors/ljharb" + "url": "https://github.com/sponsors/inspect-js" } }, "node_modules/data-view-byte-offset": { - "version": "1.0.0", - 
"resolved": "https://registry.npmjs.org/data-view-byte-offset/-/data-view-byte-offset-1.0.0.tgz", - "integrity": "sha512-t/Ygsytq+R995EJ5PZlD4Cu56sWa8InXySaViRzw9apusqsOO2bQP+SbYzAhR0pFKoB+43lYy8rWban9JSuXnA==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/data-view-byte-offset/-/data-view-byte-offset-1.0.1.tgz", + "integrity": "sha512-BS8PfmtDGnrgYdOonGZQdLZslWIeCGFP9tpan0hi1Co2Zr2NKADsvGYA8XxuG/4UWgJ6Cjtv+YJnB6MM69QGlQ==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.6", + "call-bound": "^1.0.2", "es-errors": "^1.3.0", "is-data-view": "^1.0.1" }, @@ -8370,6 +8889,19 @@ "node": ">=6" } }, + "node_modules/doctrine": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-2.1.0.tgz", + "integrity": "sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "esutils": "^2.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/dom-accessibility-api": { "version": "0.5.16", "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz", @@ -8464,6 +8996,21 @@ "tslib": "^2.0.3" } }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/duplexer": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.2.tgz", @@ -8659,57 +9206,66 @@ } }, "node_modules/es-abstract": { - "version": "1.23.3", - "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.23.3.tgz", - "integrity": 
"sha512-e+HfNH61Bj1X9/jLc5v1owaLYuHdeHHSQlkhCBiTK8rBvKaULl/beGMxwrMXjpYrv4pz22BlY570vVePA2ho4A==", + "version": "1.24.1", + "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.24.1.tgz", + "integrity": "sha512-zHXBLhP+QehSSbsS9Pt23Gg964240DPd6QCf8WpkqEXxQ7fhdZzYsocOr5u7apWonsS5EjZDmTF+/slGMyasvw==", "dev": true, + "license": "MIT", "dependencies": { - "array-buffer-byte-length": "^1.0.1", - "arraybuffer.prototype.slice": "^1.0.3", + "array-buffer-byte-length": "^1.0.2", + "arraybuffer.prototype.slice": "^1.0.4", "available-typed-arrays": "^1.0.7", - "call-bind": "^1.0.7", - "data-view-buffer": "^1.0.1", - "data-view-byte-length": "^1.0.1", - "data-view-byte-offset": "^1.0.0", - "es-define-property": "^1.0.0", + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "data-view-buffer": "^1.0.2", + "data-view-byte-length": "^1.0.2", + "data-view-byte-offset": "^1.0.1", + "es-define-property": "^1.0.1", "es-errors": "^1.3.0", - "es-object-atoms": "^1.0.0", - "es-set-tostringtag": "^2.0.3", - "es-to-primitive": "^1.2.1", - "function.prototype.name": "^1.1.6", - "get-intrinsic": "^1.2.4", - "get-symbol-description": "^1.0.2", - "globalthis": "^1.0.3", - "gopd": "^1.0.1", + "es-object-atoms": "^1.1.1", + "es-set-tostringtag": "^2.1.0", + "es-to-primitive": "^1.3.0", + "function.prototype.name": "^1.1.8", + "get-intrinsic": "^1.3.0", + "get-proto": "^1.0.1", + "get-symbol-description": "^1.1.0", + "globalthis": "^1.0.4", + "gopd": "^1.2.0", "has-property-descriptors": "^1.0.2", - "has-proto": "^1.0.3", - "has-symbols": "^1.0.3", + "has-proto": "^1.2.0", + "has-symbols": "^1.1.0", "hasown": "^2.0.2", - "internal-slot": "^1.0.7", - "is-array-buffer": "^3.0.4", + "internal-slot": "^1.1.0", + "is-array-buffer": "^3.0.5", "is-callable": "^1.2.7", - "is-data-view": "^1.0.1", + "is-data-view": "^1.0.2", "is-negative-zero": "^2.0.3", - "is-regex": "^1.1.4", - "is-shared-array-buffer": "^1.0.3", - "is-string": "^1.0.7", - "is-typed-array": "^1.1.13", - "is-weakref": 
"^1.0.2", - "object-inspect": "^1.13.1", + "is-regex": "^1.2.1", + "is-set": "^2.0.3", + "is-shared-array-buffer": "^1.0.4", + "is-string": "^1.1.1", + "is-typed-array": "^1.1.15", + "is-weakref": "^1.1.1", + "math-intrinsics": "^1.1.0", + "object-inspect": "^1.13.4", "object-keys": "^1.1.1", - "object.assign": "^4.1.5", - "regexp.prototype.flags": "^1.5.2", - "safe-array-concat": "^1.1.2", - "safe-regex-test": "^1.0.3", - "string.prototype.trim": "^1.2.9", - "string.prototype.trimend": "^1.0.8", + "object.assign": "^4.1.7", + "own-keys": "^1.0.1", + "regexp.prototype.flags": "^1.5.4", + "safe-array-concat": "^1.1.3", + "safe-push-apply": "^1.0.0", + "safe-regex-test": "^1.1.0", + "set-proto": "^1.0.0", + "stop-iteration-iterator": "^1.1.0", + "string.prototype.trim": "^1.2.10", + "string.prototype.trimend": "^1.0.9", "string.prototype.trimstart": "^1.0.8", - "typed-array-buffer": "^1.0.2", - "typed-array-byte-length": "^1.0.1", - "typed-array-byte-offset": "^1.0.2", - "typed-array-length": "^1.0.6", - "unbox-primitive": "^1.0.2", - "which-typed-array": "^1.1.15" + "typed-array-buffer": "^1.0.3", + "typed-array-byte-length": "^1.0.3", + "typed-array-byte-offset": "^1.0.4", + "typed-array-length": "^1.0.7", + "unbox-primitive": "^1.1.0", + "which-typed-array": "^1.1.19" }, "engines": { "node": ">= 0.4" @@ -8725,13 +9281,11 @@ "dev": true }, "node_modules/es-define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", - "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", "dev": true, - "dependencies": { - "get-intrinsic": "^1.2.4" - }, + "license": "MIT", "engines": { "node": ">= 0.4" } @@ -8745,16 +9299,45 
@@ "node": ">= 0.4" } }, + "node_modules/es-iterator-helpers": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/es-iterator-helpers/-/es-iterator-helpers-1.2.2.tgz", + "integrity": "sha512-BrUQ0cPTB/IwXj23HtwHjS9n7O4h9FX94b4xc5zlTHxeLgTAdzYUDyy6KdExAl9lbN5rtfe44xpjpmj9grxs5w==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "define-properties": "^1.2.1", + "es-abstract": "^1.24.1", + "es-errors": "^1.3.0", + "es-set-tostringtag": "^2.1.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.3.0", + "globalthis": "^1.0.4", + "gopd": "^1.2.0", + "has-property-descriptors": "^1.0.2", + "has-proto": "^1.2.0", + "has-symbols": "^1.1.0", + "internal-slot": "^1.1.0", + "iterator.prototype": "^1.1.5", + "safe-array-concat": "^1.1.3" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/es-module-lexer": { "version": "1.5.4", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.5.4.tgz", "integrity": "sha512-MVNK56NiMrOwitFB7cqDwq0CQutbw+0BvLshJSse0MUNU+y1FC3bUS/AQg7oUng+/wKrrki7JfmwtVHkVfPLlw==" }, "node_modules/es-object-atoms": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.0.0.tgz", - "integrity": "sha512-MZ4iQ6JwHOBQjahnjwaC1ZtIBH+2ohjamzAO3oaHcXYup7qxjF2fixyH+Q71voWHeOkI2q/TnJao/KfXYIZWbw==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", "dev": true, + "license": "MIT", "dependencies": { "es-errors": "^1.3.0" }, @@ -8763,37 +9346,44 @@ } }, "node_modules/es-set-tostringtag": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.0.3.tgz", - "integrity": "sha512-3T8uNMC3OQTHkFUsFq8r/BwAXLHvU/9O9mE0fBc/MY5iq/8H7ncvO947LmYA6ldWw9Uh8Yhf25zu6n7nML5QWQ==", + "version": "2.1.0", + 
"resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", "dev": true, + "license": "MIT", "dependencies": { - "get-intrinsic": "^1.2.4", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", - "hasown": "^2.0.1" + "hasown": "^2.0.2" }, "engines": { "node": ">= 0.4" } }, "node_modules/es-shim-unscopables": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/es-shim-unscopables/-/es-shim-unscopables-1.0.2.tgz", - "integrity": "sha512-J3yBRXCzDu4ULnQwxyToo/OjdMx6akgVC7K6few0a7F/0wLtmKKN7I73AH5T2836UuXRqN7Qg+IIUw/+YJksRw==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/es-shim-unscopables/-/es-shim-unscopables-1.1.0.tgz", + "integrity": "sha512-d9T8ucsEhh8Bi1woXCf+TIKDIROLG5WCkxg8geBCbvk22kzwC5G2OnXVMO6FUsvQlgUUXQ2itephWDLqDzbeCw==", "dev": true, + "license": "MIT", "dependencies": { - "hasown": "^2.0.0" + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" } }, "node_modules/es-to-primitive": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.2.1.tgz", - "integrity": "sha512-QCOllgZJtaUo9miYBcLChTUaHNjJF3PYs1VidD7AwiEj1kYxKeQTctLAezAOH5ZKRH0g2IgPn6KwB4IT8iRpvA==", + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.3.0.tgz", + "integrity": "sha512-w+5mJ3GuFL+NjVtJlvydShqE1eN3h3PbI7/5LAsYJP/2qtuMXjfL2LpHSRqo4b4eSF5K/DH1JXKUAHSB2UW50g==", "dev": true, + "license": "MIT", "dependencies": { - "is-callable": "^1.1.4", - "is-date-object": "^1.0.1", - "is-symbol": "^1.0.2" + "is-callable": "^1.2.7", + "is-date-object": "^1.0.5", + "is-symbol": "^1.0.4" }, "engines": { "node": ">= 0.4" @@ -8835,25 +9425,24 @@ } }, "node_modules/eslint": { - "version": "9.32.0", - "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.32.0.tgz", - "integrity": 
"sha512-LSehfdpgMeWcTZkWZVIJl+tkZ2nuSkyyB9C27MZqFWXuph7DvaowgcTvKqxvpLW1JZIk8PN7hFY3Rj9LQ7m7lg==", + "version": "9.39.2", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.39.2.tgz", + "integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==", "dev": true, "license": "MIT", "dependencies": { - "@eslint-community/eslint-utils": "^4.2.0", + "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", - "@eslint/config-array": "^0.21.0", - "@eslint/config-helpers": "^0.3.0", - "@eslint/core": "^0.15.0", + "@eslint/config-array": "^0.21.1", + "@eslint/config-helpers": "^0.4.2", + "@eslint/core": "^0.17.0", "@eslint/eslintrc": "^3.3.1", - "@eslint/js": "9.32.0", - "@eslint/plugin-kit": "^0.3.4", + "@eslint/js": "9.39.2", + "@eslint/plugin-kit": "^0.4.1", "@humanfs/node": "^0.16.6", "@humanwhocodes/module-importer": "^1.0.1", "@humanwhocodes/retry": "^0.4.2", "@types/estree": "^1.0.6", - "@types/json-schema": "^7.0.15", "ajv": "^6.12.4", "chalk": "^4.0.0", "cross-spawn": "^7.0.6", @@ -8911,6 +9500,81 @@ "eslint": ">=7.0.0" } }, + "node_modules/eslint-import-context": { + "version": "0.1.9", + "resolved": "https://registry.npmjs.org/eslint-import-context/-/eslint-import-context-0.1.9.tgz", + "integrity": "sha512-K9Hb+yRaGAGUbwjhFNHvSmmkZs9+zbuoe3kFQ4V1wYjrepUFYM2dZAfNtjbbj3qsPfUfsA68Bx/ICWQMi+C8Eg==", + "dev": true, + "license": "MIT", + "dependencies": { + "get-tsconfig": "^4.10.1", + "stable-hash-x": "^0.2.0" + }, + "engines": { + "node": "^12.20.0 || ^14.18.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/eslint-import-context" + }, + "peerDependencies": { + "unrs-resolver": "^1.0.0" + }, + "peerDependenciesMeta": { + "unrs-resolver": { + "optional": true + } + } + }, + "node_modules/eslint-import-resolver-node": { + "version": "0.3.9", + "resolved": "https://registry.npmjs.org/eslint-import-resolver-node/-/eslint-import-resolver-node-0.3.9.tgz", + "integrity": 
"sha512-WFj2isz22JahUv+B788TlO3N6zL3nNJGU8CcZbPZvVEkBPaJdCV4vy5wyghty5ROFbCRnm132v8BScu5/1BQ8g==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "^3.2.7", + "is-core-module": "^2.13.0", + "resolve": "^1.22.4" + } + }, + "node_modules/eslint-import-resolver-node/node_modules/debug": { + "version": "3.2.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", + "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.1" + } + }, + "node_modules/eslint-module-utils": { + "version": "2.12.1", + "resolved": "https://registry.npmjs.org/eslint-module-utils/-/eslint-module-utils-2.12.1.tgz", + "integrity": "sha512-L8jSWTze7K2mTg0vos/RuLRS5soomksDPoJLXIslC7c8Wmut3bx7CPpJijDcBZtxQ5lrbUdM+s0OlNbz0DCDNw==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "^3.2.7" + }, + "engines": { + "node": ">=4" + }, + "peerDependenciesMeta": { + "eslint": { + "optional": true + } + } + }, + "node_modules/eslint-module-utils/node_modules/debug": { + "version": "3.2.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", + "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.1" + } + }, "node_modules/eslint-plugin-i18n": { "version": "2.4.0", "resolved": "https://registry.npmjs.org/eslint-plugin-i18n/-/eslint-plugin-i18n-2.4.0.tgz", @@ -8920,6 +9584,130 @@ "node": ">=12.0.0" } }, + "node_modules/eslint-plugin-import": { + "version": "2.32.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.32.0.tgz", + "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@rtsao/scc": "^1.1.0", + "array-includes": "^3.1.9", + 
"array.prototype.findlastindex": "^1.2.6", + "array.prototype.flat": "^1.3.3", + "array.prototype.flatmap": "^1.3.3", + "debug": "^3.2.7", + "doctrine": "^2.1.0", + "eslint-import-resolver-node": "^0.3.9", + "eslint-module-utils": "^2.12.1", + "hasown": "^2.0.2", + "is-core-module": "^2.16.1", + "is-glob": "^4.0.3", + "minimatch": "^3.1.2", + "object.fromentries": "^2.0.8", + "object.groupby": "^1.0.3", + "object.values": "^1.2.1", + "semver": "^6.3.1", + "string.prototype.trimend": "^1.0.9", + "tsconfig-paths": "^3.15.0" + }, + "engines": { + "node": ">=4" + }, + "peerDependencies": { + "eslint": "^2 || ^3 || ^4 || ^5 || ^6 || ^7.2.0 || ^8 || ^9" + } + }, + "node_modules/eslint-plugin-import-x": { + "version": "4.16.1", + "resolved": "https://registry.npmjs.org/eslint-plugin-import-x/-/eslint-plugin-import-x-4.16.1.tgz", + "integrity": "sha512-vPZZsiOKaBAIATpFE2uMI4w5IRwdv/FpQ+qZZMR4E+PeOcM4OeoEbqxRMnywdxP19TyB/3h6QBB0EWon7letSQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/types": "^8.35.0", + "comment-parser": "^1.4.1", + "debug": "^4.4.1", + "eslint-import-context": "^0.1.9", + "is-glob": "^4.0.3", + "minimatch": "^9.0.3 || ^10.0.1", + "semver": "^7.7.2", + "stable-hash-x": "^0.2.0", + "unrs-resolver": "^1.9.2" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint-plugin-import-x" + }, + "peerDependencies": { + "@typescript-eslint/utils": "^8.0.0", + "eslint": "^8.57.0 || ^9.0.0", + "eslint-import-resolver-node": "*" + }, + "peerDependenciesMeta": { + "@typescript-eslint/utils": { + "optional": true + }, + "eslint-import-resolver-node": { + "optional": true + } + } + }, + "node_modules/eslint-plugin-import-x/node_modules/@typescript-eslint/types": { + "version": "8.53.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.53.1.tgz", + "integrity": 
"sha512-jr/swrr2aRmUAUjW5/zQHbMaui//vQlsZcJKijZf3M26bnmLj8LyZUpj8/Rd6uzaek06OWsqdofN/Thenm5O8A==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/eslint-plugin-import-x/node_modules/minimatch": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.1.1.tgz", + "integrity": "sha512-enIvLvRAFZYXJzkCYG5RKmPfrFArdLv+R+lbQ53BmIMLIry74bjKzX6iHAm8WYamJkhSSEabrWN5D97XnKObjQ==", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "@isaacs/brace-expansion": "^5.0.0" + }, + "engines": { + "node": "20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/eslint-plugin-import-x/node_modules/semver": { + "version": "7.7.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", + "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/eslint-plugin-import/node_modules/debug": { + "version": "3.2.7", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", + "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.1" + } + }, "node_modules/eslint-plugin-prettier": { "version": "5.4.1", "resolved": "https://registry.npmjs.org/eslint-plugin-prettier/-/eslint-plugin-prettier-5.4.1.tgz", @@ -8951,6 +9739,67 @@ } } }, + "node_modules/eslint-plugin-react": { + "version": "7.37.5", + "resolved": "https://registry.npmjs.org/eslint-plugin-react/-/eslint-plugin-react-7.37.5.tgz", + "integrity": 
"sha512-Qteup0SqU15kdocexFNAJMvCJEfa2xUKNV4CC1xsVMrIIqEy3SQ/rqyxCWNzfrd3/ldy6HMlD2e0JDVpDg2qIA==", + "dev": true, + "license": "MIT", + "dependencies": { + "array-includes": "^3.1.8", + "array.prototype.findlast": "^1.2.5", + "array.prototype.flatmap": "^1.3.3", + "array.prototype.tosorted": "^1.1.4", + "doctrine": "^2.1.0", + "es-iterator-helpers": "^1.2.1", + "estraverse": "^5.3.0", + "hasown": "^2.0.2", + "jsx-ast-utils": "^2.4.1 || ^3.0.0", + "minimatch": "^3.1.2", + "object.entries": "^1.1.9", + "object.fromentries": "^2.0.8", + "object.values": "^1.2.1", + "prop-types": "^15.8.1", + "resolve": "^2.0.0-next.5", + "semver": "^6.3.1", + "string.prototype.matchall": "^4.0.12", + "string.prototype.repeat": "^1.0.0" + }, + "engines": { + "node": ">=4" + }, + "peerDependencies": { + "eslint": "^3 || ^4 || ^5 || ^6 || ^7 || ^8 || ^9.7" + } + }, + "node_modules/eslint-plugin-react/node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=4.0" + } + }, + "node_modules/eslint-plugin-react/node_modules/resolve": { + "version": "2.0.0-next.5", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-2.0.0-next.5.tgz", + "integrity": "sha512-U7WjGVG9sH8tvjW5SmGbQuui75FiyjAX72HX15DwBBwF9dNiQZRQAg9nnPhYy+TUnE0+VcrttuvNI8oSxZcocA==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-core-module": "^2.13.0", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/eslint-plugin-simple-import-sort": { "version": "12.1.1", "resolved": "https://registry.npmjs.org/eslint-plugin-simple-import-sort/-/eslint-plugin-simple-import-sort-12.1.1.tgz", @@ -9692,12 +10541,19 @@ } 
}, "node_modules/for-each": { - "version": "0.3.3", - "resolved": "https://registry.npmjs.org/for-each/-/for-each-0.3.3.tgz", - "integrity": "sha512-jqYfLp7mo9vIyQf8ykW2v7A+2N4QjeCeI5+Dz9XraiO1ign81wjiH7Fb9vSOWvQfNtmSa4H2RoQTrrXivdUZmw==", + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/for-each/-/for-each-0.3.5.tgz", + "integrity": "sha512-dKx12eRCVIzqCxFGplyFKJMPvLEWgmNtUrpTiJIR5u97zEhRG8ySrtboPHZXx7daLxQVrl643cTzbab2tkQjxg==", "dev": true, + "license": "MIT", "dependencies": { - "is-callable": "^1.1.3" + "is-callable": "^1.2.7" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, "node_modules/fork-ts-checker-webpack-plugin": { @@ -9943,15 +10799,18 @@ } }, "node_modules/function.prototype.name": { - "version": "1.1.6", - "resolved": "https://registry.npmjs.org/function.prototype.name/-/function.prototype.name-1.1.6.tgz", - "integrity": "sha512-Z5kx79swU5P27WEayXM1tBi5Ze/lbIyiNgU3qyXUOf9b2rgXYyF9Dy9Cx+IQv/Lc8WCG6L82zwUPpSS9hGehIg==", + "version": "1.1.8", + "resolved": "https://registry.npmjs.org/function.prototype.name/-/function.prototype.name-1.1.8.tgz", + "integrity": "sha512-e5iwyodOHhbMr/yNrc7fDYG4qlbIvI5gajyzPnb5TCwyhjApznQh1BMFou9b30SevY43gCJKXycoCBjMbsuW0Q==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.2", - "define-properties": "^1.2.0", - "es-abstract": "^1.22.1", - "functions-have-names": "^1.2.3" + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "functions-have-names": "^1.2.3", + "hasown": "^2.0.2", + "is-callable": "^1.2.7" }, "engines": { "node": ">= 0.4" @@ -9969,6 +10828,16 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/generator-function": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/generator-function/-/generator-function-2.0.1.tgz", + "integrity": "sha512-SFdFmIJi+ybC0vjlHN0ZGVGHc3lgE0DxPAT0djjVg+kjOnSqclqmj0KQ7ykTOLP6YxoqOvuAODGdcHJn+43q3g==", + "dev": true, + 
"license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/gensync": { "version": "1.0.0-beta.2", "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", @@ -10001,16 +10870,22 @@ } }, "node_modules/get-intrinsic": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", - "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", "dev": true, + "license": "MIT", "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", - "has-proto": "^1.0.1", - "has-symbols": "^1.0.3", - "hasown": "^2.0.0" + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" }, "engines": { "node": ">= 0.4" @@ -10025,7 +10900,21 @@ "integrity": "sha512-pjzuKtY64GYfWizNAJ0fr9VqttZkNiK2iS430LtIHzjBEr6bX8Am2zm4sW4Ro5wjWW5cAlRL1qAMTcXbjNAO2Q==", "dev": true, "engines": { - "node": ">=8.0.0" + "node": ">=8.0.0" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" } }, "node_modules/get-stream": { @@ -10041,14 +10930,15 @@ } }, "node_modules/get-symbol-description": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/get-symbol-description/-/get-symbol-description-1.0.2.tgz", - "integrity": 
"sha512-g0QYk1dZBxGwk+Ngc+ltRH2IBp2f7zBkBMBJZCDerh6EhlhSR6+9irMCuT/09zD6qkarHUSn529sK/yL4S27mg==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/get-symbol-description/-/get-symbol-description-1.1.0.tgz", + "integrity": "sha512-w9UMqWwJxHNOvoNzSJ2oPF5wvYcvP7jUvYzhp67yEhTi17ZDBBC1z9pTdGuzjD+EFIqLSYRweZjqfiPzQ06Ebg==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.5", + "call-bound": "^1.0.3", "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.4" + "get-intrinsic": "^1.2.6" }, "engines": { "node": ">= 0.4" @@ -10057,6 +10947,19 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/get-tsconfig": { + "version": "4.13.0", + "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.0.tgz", + "integrity": "sha512-1VKTZJCwBrvbd+Wn3AOgQP/2Av+TfTCOlE4AcRJE72W1ksZXbAx8PPBR9RzgTeSPzlPMHrbANMH3LbltH73wxQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "resolve-pkg-maps": "^1.0.0" + }, + "funding": { + "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" + } + }, "node_modules/glob": { "version": "7.2.3", "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", @@ -10188,12 +11091,13 @@ } }, "node_modules/gopd": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz", - "integrity": "sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", "dev": true, - "dependencies": { - "get-intrinsic": "^1.1.3" + "license": "MIT", + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -10247,10 +11151,14 @@ } }, "node_modules/has-bigints": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.0.2.tgz", - "integrity": 
"sha512-tSvCKtBr9lkF0Ex0aQiP9N+OpV4zi2r/Nee5VkRDbaqv35RLYMzbwQfFSZZH0kR+Rd6302UJZ2p/bJCEoR3VoQ==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.1.0.tgz", + "integrity": "sha512-R3pbpkcIqv2Pm3dUwgjclDRVmWpTJW2DcMzcIhEXEx1oh/CEMObMm3KLmRJOdvhM7o4uQBnwr8pzRK2sJWIqfg==", "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, "funding": { "url": "https://github.com/sponsors/ljharb" } @@ -10276,10 +11184,14 @@ } }, "node_modules/has-proto": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz", - "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.2.0.tgz", + "integrity": "sha512-KIL7eQPfHQRC8+XluaIw7BHUwwqL19bQn4hzNgdr+1wXoU0KKj6rufu47lhY7KbJR2C6T6+PfyN0Ea7wkSS+qQ==", "dev": true, + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.0" + }, "engines": { "node": ">= 0.4" }, @@ -10288,10 +11200,11 @@ } }, "node_modules/has-symbols": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.3.tgz", - "integrity": "sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.4" }, @@ -10849,14 +11762,15 @@ "dev": true }, "node_modules/internal-slot": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/internal-slot/-/internal-slot-1.0.7.tgz", - "integrity": "sha512-NGnrKwXzSms2qUUih/ILZ5JBqNTSa1+ZmP6flaIp6KmSElgE9qdndzS3cqjrDovwFdmwsGsLdeFgB6suw+1e9g==", + "version": "1.1.0", + "resolved": 
"https://registry.npmjs.org/internal-slot/-/internal-slot-1.1.0.tgz", + "integrity": "sha512-4gd7VpWNQNB4UKKCFFVcp1AVv+FMOgs9NKzjHKusc8jTMhd5eL1NqQqOpE0KzMds804/yHlglp3uxgluOqAPLw==", "dev": true, + "license": "MIT", "dependencies": { "es-errors": "^1.3.0", - "hasown": "^2.0.0", - "side-channel": "^1.0.4" + "hasown": "^2.0.2", + "side-channel": "^1.1.0" }, "engines": { "node": ">= 0.4" @@ -10892,13 +11806,15 @@ } }, "node_modules/is-array-buffer": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/is-array-buffer/-/is-array-buffer-3.0.4.tgz", - "integrity": "sha512-wcjaerHw0ydZwfhiKbXJWLDY8A7yV7KhjQOpb83hGgGfId/aQa4TOvwyzn2PuswW2gPCYEL/nEAiSVpdOj1lXw==", + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/is-array-buffer/-/is-array-buffer-3.0.5.tgz", + "integrity": "sha512-DDfANUiiG2wC1qawP66qlTugJeL5HyzMpfr8lLK+jMQirGzNod0B12cFB/9q838Ru27sBwfw78/rdoU7RERz6A==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.2", - "get-intrinsic": "^1.2.1" + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "get-intrinsic": "^1.2.6" }, "engines": { "node": ">= 0.4" @@ -10913,13 +11829,37 @@ "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", "dev": true }, + "node_modules/is-async-function": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-async-function/-/is-async-function-2.1.1.tgz", + "integrity": "sha512-9dgM/cZBnNvjzaMYHVoxxfPj2QXt22Ev7SuuPrs+xav0ukGB0S6d4ydZdEiM48kLx5kDV+QBPrpVnFyefL8kkQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "async-function": "^1.0.0", + "call-bound": "^1.0.3", + "get-proto": "^1.0.1", + "has-tostringtag": "^1.0.2", + "safe-regex-test": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/is-bigint": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/is-bigint/-/is-bigint-1.0.4.tgz", - "integrity": 
"sha512-zB9CruMamjym81i2JZ3UMn54PKGsQzsJeo6xvN3HJJ4CAsQNB6iRutp2To77OfCNuoxspsIhzaPoO1zyCEhFOg==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-bigint/-/is-bigint-1.1.0.tgz", + "integrity": "sha512-n4ZT37wG78iz03xPRKJrHTdZbe3IicyucEtdRsV5yglwc3GyUfbAfpSeD0FJ41NbUNSt5wbhqfp1fS+BgnvDFQ==", "dev": true, + "license": "MIT", "dependencies": { - "has-bigints": "^1.0.1" + "has-bigints": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -10938,13 +11878,14 @@ } }, "node_modules/is-boolean-object": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.1.2.tgz", - "integrity": "sha512-gDYaKHJmnj4aWxyj6YHyXVpdQawtVLHU5cb+eztPGczf6cjuTdwve5ZIEfgXqH4e57An1D1AKf8CZ3kYrQRqYA==", + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.2.2.tgz", + "integrity": "sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.2", - "has-tostringtag": "^1.0.0" + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -10971,10 +11912,11 @@ } }, "node_modules/is-core-module": { - "version": "2.15.1", - "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.15.1.tgz", - "integrity": "sha512-z0vtXSwucUJtANQWldhbtbt7BnL0vxiFjIdDLAatwhDYty2bad6s+rijD6Ri4YuYJubLzIJLUidCh09e1djEVQ==", + "version": "2.16.1", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", + "integrity": "sha512-UfoeMA6fIJ8wTYFEUjelnaGI67v6+N7qXJEvQuIGa99l4xsCruSYOVSQ0uPANn4dAzm8lkYPaKLrrijLq7x23w==", "dev": true, + "license": "MIT", "dependencies": { "hasown": "^2.0.2" }, @@ -10986,11 +11928,14 @@ } }, "node_modules/is-data-view": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/is-data-view/-/is-data-view-1.0.1.tgz", - "integrity": 
"sha512-AHkaJrsUVW6wq6JS8y3JnM/GJF/9cf+k20+iDzlSaJrinEo5+7vRiteOSwBhHRiAyQATN1AmY4hwzxJKPmYf+w==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-data-view/-/is-data-view-1.0.2.tgz", + "integrity": "sha512-RKtWF8pGmS87i2D6gqQu/l7EYRlVdfzemCJN/P3UOs//x1QE7mfhvzHIApBTRf7axvT6DMGwSwBXYCT0nfB9xw==", "dev": true, + "license": "MIT", "dependencies": { + "call-bound": "^1.0.2", + "get-intrinsic": "^1.2.6", "is-typed-array": "^1.1.13" }, "engines": { @@ -11001,12 +11946,14 @@ } }, "node_modules/is-date-object": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.5.tgz", - "integrity": "sha512-9YQaSxsAiSwcvS33MBk3wTCVnWK+HhF8VZR2jRxehM16QcVOdHqPn4VPHmRK4lSr38n9JriurInLcP90xsYNfQ==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.1.0.tgz", + "integrity": "sha512-PwwhEakHVKTdRNVOw+/Gyh0+MzlCl4R6qKvkhuvLtPMggI1WAHt9sOwZxQLSGpUaDnrdyDsomoRgNnCfKNSXXg==", "dev": true, + "license": "MIT", "dependencies": { - "has-tostringtag": "^1.0.0" + "call-bound": "^1.0.2", + "has-tostringtag": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -11039,6 +11986,22 @@ "node": ">=0.10.0" } }, + "node_modules/is-finalizationregistry": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-finalizationregistry/-/is-finalizationregistry-1.1.1.tgz", + "integrity": "sha512-1pC6N8qWJbWoPtEjgcL2xyhQOP491EQjeUo3qTKcmV8YSDDJrOepfG8pcC7h/QgnQHYSv0mJ3Z/ZWxmatVrysg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/is-fullwidth-code-point": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", @@ -11057,6 +12020,26 @@ "node": ">=6" } }, + "node_modules/is-generator-function": { + "version": "1.1.2", + "resolved": 
"https://registry.npmjs.org/is-generator-function/-/is-generator-function-1.1.2.tgz", + "integrity": "sha512-upqt1SkGkODW9tsGNG5mtXTXtECizwtS2kA161M+gJPc1xdb/Ax629af6YrTwcOeQHbewrPNlE5Dx7kzvXTizA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.4", + "generator-function": "^2.0.0", + "get-proto": "^1.0.1", + "has-tostringtag": "^1.0.2", + "safe-regex-test": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/is-glob": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", @@ -11102,6 +12085,19 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/is-map": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz", + "integrity": "sha512-1Qed0/Hr2m+YqxnM09CjA2d/i6YZNfF6R2oRAOj36eUdS6qIV/huPJNSEpKbupewFs+ZsJlxsjjPbc0/afW6Lw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/is-negative-zero": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.3.tgz", @@ -11136,12 +12132,14 @@ } }, "node_modules/is-number-object": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/is-number-object/-/is-number-object-1.0.7.tgz", - "integrity": "sha512-k1U0IRzLMo7ZlYIfzRu23Oh6MiIFasgpb9X76eqfFZAqwH44UI4KTBvBYIZ1dSL9ZzChTB9ShHfLkR4pdW5krQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-number-object/-/is-number-object-1.1.1.tgz", + "integrity": "sha512-lZhclumE1G6VYD8VHe35wFaIif+CTy5SJIi5+3y4psDgWu4wPDoBhF8NxUOinEc7pHgiTsT6MaBb92rKhhD+Xw==", "dev": true, + "license": "MIT", "dependencies": { - "has-tostringtag": "^1.0.0" + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -11175,13 +12173,16 @@ } }, "node_modules/is-regex": { - "version": "1.1.4", 
- "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.4.tgz", - "integrity": "sha512-kvRdxDsxZjhzUX07ZnLydzS1TU/TJlTUHHY4YLL87e37oUA49DfkLqgy+VjFocowy29cKvcSiu+kIv728jTTVg==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.2.1.tgz", + "integrity": "sha512-MjYsKHO5O7mCsmRGxWcLWheFqN9DJ/2TmngvjKXihe6efViPqc274+Fx/4fYj/r03+ESvBdTXK0V6tA3rgez1g==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.2", - "has-tostringtag": "^1.0.0" + "call-bound": "^1.0.2", + "gopd": "^1.2.0", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" }, "engines": { "node": ">= 0.4" @@ -11204,13 +12205,27 @@ "node": ">=6" } }, + "node_modules/is-set": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/is-set/-/is-set-2.0.3.tgz", + "integrity": "sha512-iPAjerrse27/ygGLxw+EBR9agv9Y6uLeYVJMu+QNCoouJ1/1ri0mGrcWpfCqFZuzzx3WjtwxG098X+n4OuRkPg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/is-shared-array-buffer": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.3.tgz", - "integrity": "sha512-nA2hv5XIhLR3uVzDDfCIknerhx8XUKnstuOERPNNIinXG7v9u+ohXF67vxm4TPTEPU6lm61ZkwP3c9PCB97rhg==", + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.4.tgz", + "integrity": "sha512-ISWac8drv4ZGfwKl5slpHG9OwPNty4jOWPRIhBpxOoD+hqITiwuipOQ2bNthAzwA3B4fIjO4Nln74N0S9byq8A==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.7" + "call-bound": "^1.0.3" }, "engines": { "node": ">= 0.4" @@ -11232,12 +12247,14 @@ } }, "node_modules/is-string": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.0.7.tgz", - "integrity": "sha512-tE2UXzivje6ofPW7l23cjDOMa09gb7xlAqG6jG5ej6uPV32TlWP3NKPigtaGeHNu9fohccRYvIiZMfOOnOYUtg==", + "version": "1.1.1", 
+ "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.1.1.tgz", + "integrity": "sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==", "dev": true, + "license": "MIT", "dependencies": { - "has-tostringtag": "^1.0.0" + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -11253,12 +12270,15 @@ "dev": true }, "node_modules/is-symbol": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.0.4.tgz", - "integrity": "sha512-C/CPBqKWnvdcxqIARxyOh4v1UUEOCHpgDa0WYgpKDFMszcrPcffg5uhwSgPCLD2WWxmq6isisz87tzT01tuGhg==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.1.1.tgz", + "integrity": "sha512-9gGx6GTtCQM73BgmHQXfDmLtfjjTUDSyoxTCbp5WtoixAhfgsDirWIcVQ/IHpvI5Vgd5i/J5F7B9cN/WlVbC/w==", "dev": true, + "license": "MIT", "dependencies": { - "has-symbols": "^1.0.2" + "call-bound": "^1.0.2", + "has-symbols": "^1.1.0", + "safe-regex-test": "^1.1.0" }, "engines": { "node": ">= 0.4" @@ -11268,12 +12288,13 @@ } }, "node_modules/is-typed-array": { - "version": "1.1.13", - "resolved": "https://registry.npmjs.org/is-typed-array/-/is-typed-array-1.1.13.tgz", - "integrity": "sha512-uZ25/bUAlUY5fR4OKT4rZQEBrzQWYV9ZJYGGsUmEJ6thodVJ1HX64ePQ6Z0qPWP+m+Uq6e9UugrE38jeYsDSMw==", + "version": "1.1.15", + "resolved": "https://registry.npmjs.org/is-typed-array/-/is-typed-array-1.1.15.tgz", + "integrity": "sha512-p3EcsicXjit7SaskXHs1hA91QxgTw46Fv6EFKKGS5DRFLD8yKnohjF3hxoju94b/OcMZoQukzpPpBE9uLVKzgQ==", "dev": true, + "license": "MIT", "dependencies": { - "which-typed-array": "^1.1.14" + "which-typed-array": "^1.1.16" }, "engines": { "node": ">= 0.4" @@ -11282,13 +12303,47 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-weakmap": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/is-weakmap/-/is-weakmap-2.0.2.tgz", + "integrity": 
"sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/is-weakref": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-weakref/-/is-weakref-1.0.2.tgz", - "integrity": "sha512-qctsuLZmIQ0+vSSMfoVvyFe2+GSEvnmZ2ezTup1SBse9+twCCeial6EEi3Nc2KFcf6+qz2FBPnjXsk8xhKSaPQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-weakref/-/is-weakref-1.1.1.tgz", + "integrity": "sha512-6i9mGWSlqzNMEqpCp93KwRS1uUOodk2OJ6b+sq7ZPDSy2WuI5NFIxp/254TytR8ftefexkWn5xNiHUNpPOfSew==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.2" + "call-bound": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-weakset": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-weakset/-/is-weakset-2.0.4.tgz", + "integrity": "sha512-mfcwb6IzQyOKTs84CQMrOwW4gQcaTOAWJ0zzJCl2WSPDrWk/OzDaImWFH3djXhb24g4eudZfLRozAvPGw4d9hQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "get-intrinsic": "^1.2.6" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -11310,7 +12365,8 @@ "version": "2.0.5", "resolved": "https://registry.npmjs.org/isarray/-/isarray-2.0.5.tgz", "integrity": "sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==", - "dev": true + "dev": true, + "license": "MIT" }, "node_modules/isexe": { "version": "2.0.0", @@ -11441,6 +12497,24 @@ "node": ">=8" } }, + "node_modules/iterator.prototype": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/iterator.prototype/-/iterator.prototype-1.1.5.tgz", + "integrity": 
"sha512-H0dkQoCa3b2VEeKQBOxFph+JAbcrQdE7KC0UkqwpLmv2EC4P41QXP+rqo9wYodACiG5/WM5s9oDApTU8utwj9g==", + "dev": true, + "license": "MIT", + "dependencies": { + "define-data-property": "^1.1.4", + "es-object-atoms": "^1.0.0", + "get-intrinsic": "^1.2.6", + "get-proto": "^1.0.0", + "has-symbols": "^1.1.0", + "set-function-name": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/jake": { "version": "10.9.2", "resolved": "https://registry.npmjs.org/jake/-/jake-10.9.2.tgz", @@ -12409,6 +13483,22 @@ "graceful-fs": "^4.1.6" } }, + "node_modules/jsx-ast-utils": { + "version": "3.3.5", + "resolved": "https://registry.npmjs.org/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz", + "integrity": "sha512-ZZow9HBI5O6EPgSJLUb8n2NKgmVWTwCvHGwFuJlMjvLFqlGG6pjirPhtdsseaLZjSibD8eegzmYpUZwoIlj2cQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "array-includes": "^3.1.6", + "array.prototype.flat": "^1.3.1", + "object.assign": "^4.1.4", + "object.values": "^1.1.6" + }, + "engines": { + "node": ">=4.0" + } + }, "node_modules/keyv": { "version": "4.5.4", "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", @@ -12988,6 +14078,16 @@ "tmpl": "1.0.5" } }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/md5": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/md5/-/md5-2.3.0.tgz", @@ -13248,6 +14348,22 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, + "node_modules/napi-postinstall": { + "version": "0.3.4", + "resolved": "https://registry.npmjs.org/napi-postinstall/-/napi-postinstall-0.3.4.tgz", + "integrity": "sha512-PHI5f1O0EP5xJ9gQmFGMS6IZcrVvTjpXjz7Na41gTE7eE2hK11lg04CECCYEEjdc17EV4DO+fkGEtt7TpTaTiQ==", + "dev": true, + "license": "MIT", + 
"bin": { + "napi-postinstall": "lib/cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.18.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/napi-postinstall" + } + }, "node_modules/natural-compare": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", @@ -18584,10 +19700,11 @@ } }, "node_modules/object-inspect": { - "version": "1.13.2", - "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz", - "integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==", + "version": "1.13.4", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz", + "integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.4" }, @@ -18621,14 +19738,17 @@ } }, "node_modules/object.assign": { - "version": "4.1.5", - "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.5.tgz", - "integrity": "sha512-byy+U7gp+FVwmyzKPYhW2h5l3crpmGsxl7X2s8y43IgxvG4g3QZ6CffDtsNQy1WsmZpQbO+ybo0AlW7TY6DcBQ==", + "version": "4.1.7", + "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.7.tgz", + "integrity": "sha512-nK28WOo+QIjBkDduTINE4JkF/UJJKyf2EJxvJKfblDpyg0Q+pkOHNTL0Qwy6NP6FhE/EnzV73BxxqcJaXY9anw==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.5", + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", "define-properties": "^1.2.1", - "has-symbols": "^1.0.3", + "es-object-atoms": "^1.0.0", + "has-symbols": "^1.1.0", "object-keys": "^1.1.1" }, "engines": { @@ -18639,27 +19759,65 @@ } }, "node_modules/object.entries": { - "version": "1.1.8", - "resolved": "https://registry.npmjs.org/object.entries/-/object.entries-1.1.8.tgz", - "integrity": "sha512-cmopxi8VwRIAw/fkijJohSfpef5PdN0pMQJN6VC/ZKvn0LIknWD8KtgY6KlQdEc4tIjcQ3HxSMmnvtzIscdaYQ==", + "version": 
"1.1.9", + "resolved": "https://registry.npmjs.org/object.entries/-/object.entries-1.1.9.tgz", + "integrity": "sha512-8u/hfXFRBD1O0hPUjioLhoWFHRmt6tKA4/vZPyckBr18l1KE9uHrFaFaUi8MDRTpi4uak2goyPTSNJLXX2k2Hw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.fromentries": { + "version": "2.0.8", + "resolved": "https://registry.npmjs.org/object.fromentries/-/object.fromentries-2.0.8.tgz", + "integrity": "sha512-k6E21FzySsSK5a21KRADBd/NGneRegFO5pLHfdQLpRDETUNJueLXs3WCzyQ3tFRDYgbq3KHGXfTbi2bs8WQ6rQ==", "dev": true, + "license": "MIT", "dependencies": { "call-bind": "^1.0.7", "define-properties": "^1.2.1", + "es-abstract": "^1.23.2", "es-object-atoms": "^1.0.0" }, "engines": { "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/object.values": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/object.values/-/object.values-1.2.0.tgz", - "integrity": "sha512-yBYjY9QX2hnRmZHAjG/f13MzmBzxzYgQhFrke06TTyKY5zSTEqkOeukBzIdVA3j3ulu8Qa3MbVFShV7T2RmGtQ==", + "node_modules/object.groupby": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/object.groupby/-/object.groupby-1.0.3.tgz", + "integrity": "sha512-+Lhy3TQTuzXI5hevh8sBGqbmurHbbIjAi0Z4S63nthVLmLxfbj4T54a4CfZrXIrt9iP4mVAPYMo/v99taj3wjQ==", "dev": true, + "license": "MIT", "dependencies": { "call-bind": "^1.0.7", "define-properties": "^1.2.1", + "es-abstract": "^1.23.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.values": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/object.values/-/object.values-1.2.1.tgz", + "integrity": "sha512-gXah6aZrcUxjWg2zR2MwouP2eHlCBzdV4pygudehaKXSGW4v2AsRQUK+lwwXhii6KFZcunEnmSUoYp5CXibxtA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + 
"call-bound": "^1.0.3", + "define-properties": "^1.2.1", "es-object-atoms": "^1.0.0" }, "engines": { @@ -18799,6 +19957,24 @@ "node": ">= 0.8.0" } }, + "node_modules/own-keys": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/own-keys/-/own-keys-1.0.1.tgz", + "integrity": "sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==", + "dev": true, + "license": "MIT", + "dependencies": { + "get-intrinsic": "^1.2.6", + "object-keys": "^1.1.1", + "safe-push-apply": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -19211,10 +20387,11 @@ } }, "node_modules/possible-typed-array-names": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.0.0.tgz", - "integrity": "sha512-d7Uw+eZoloe0EHDIYoe+bQ5WXnGMOpmiZFTuMWCwpjzzkL2nTjcKiAk4hh8TjnGye2TwWOk3UXucZ+3rbmBa8Q==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.1.0.tgz", + "integrity": "sha512-/+5VFTchJDoVj3bhoqi6UeymcD00DAwb1nJwamzPvHEszJ4FpF6SNNbUbOS8yI56qHzdV8eK0qEfOSiodkTdxg==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.4" } @@ -21020,6 +22197,29 @@ "resolved": "https://registry.npmjs.org/redux/-/redux-5.0.1.tgz", "integrity": "sha512-M9/ELqF6fy8FwmkpnF0S3YKOqMyoWJ4+CS5Efg2ct3oY9daQvd/Pc71FpGZsVsbl3Cpb+IIcjBDUnnyBdQbq4w==" }, + "node_modules/reflect.getprototypeof": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/reflect.getprototypeof/-/reflect.getprototypeof-1.0.10.tgz", + "integrity": "sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + 
"es-abstract": "^1.23.9", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "get-intrinsic": "^1.2.7", + "get-proto": "^1.0.1", + "which-builtin-type": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/reftools": { "version": "1.1.9", "resolved": "https://registry.npmjs.org/reftools/-/reftools-1.1.9.tgz", @@ -21068,14 +22268,17 @@ "dev": true }, "node_modules/regexp.prototype.flags": { - "version": "1.5.3", - "resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.5.3.tgz", - "integrity": "sha512-vqlC04+RQoFalODCbCumG2xIOvapzVMHwsyIGM/SIE8fRhFFsXeH8/QQ+s0T0kDAhKc4k30s73/0ydkHQz6HlQ==", + "version": "1.5.4", + "resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.5.4.tgz", + "integrity": "sha512-dYqgNSZbDwkaJ2ceRd9ojCGjBq+mOm9LmtXnAnEGyHhN/5R7iDW2TRw3h+o/jCFxus3P2LfWIIiwowAjANm7IA==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.7", + "call-bind": "^1.0.8", "define-properties": "^1.2.1", "es-errors": "^1.3.0", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", "set-function-name": "^2.0.2" }, "engines": { @@ -21309,6 +22512,16 @@ "node": ">=4" } }, + "node_modules/resolve-pkg-maps": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", + "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" + } + }, "node_modules/resolve-url-loader": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/resolve-url-loader/-/resolve-url-loader-5.0.0.tgz", @@ -21490,14 +22703,16 @@ } }, "node_modules/safe-array-concat": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.2.tgz", - "integrity": 
"sha512-vj6RsCsWBCf19jIeHEfkRMw8DPiBb+DMXklQ/1SGDHOMlHdPUkZXFQ2YdplS23zESTijAcurb1aSgJA3AgMu1Q==", + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.3.tgz", + "integrity": "sha512-AURm5f0jYEOydBj7VQlVvDrjeFgthDdEF5H1dP+6mNpoXOMo1quQqJ4wvJDyRZ9+pO3kGWoOdmV08cSv2aJV6Q==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.7", - "get-intrinsic": "^1.2.4", - "has-symbols": "^1.0.3", + "call-bind": "^1.0.8", + "call-bound": "^1.0.2", + "get-intrinsic": "^1.2.6", + "has-symbols": "^1.1.0", "isarray": "^2.0.5" }, "engines": { @@ -21526,15 +22741,33 @@ } ] }, + "node_modules/safe-push-apply": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/safe-push-apply/-/safe-push-apply-1.0.0.tgz", + "integrity": "sha512-iKE9w/Z7xCzUMIZqdBsp6pEQvwuEebH4vdpjcDWnyzaI6yl6O9FHvVpmGelvEHNsoY6wGblkxR6Zty/h00WiSA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "isarray": "^2.0.5" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/safe-regex-test": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/safe-regex-test/-/safe-regex-test-1.0.3.tgz", - "integrity": "sha512-CdASjNJPvRa7roO6Ra/gLYBTzYzzPyyBXxIMdGW3USQLyjWEls2RgW5UBTXaQVp+OrpeCK3bLem8smtmheoRuw==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/safe-regex-test/-/safe-regex-test-1.1.0.tgz", + "integrity": "sha512-x/+Cz4YrimQxQccJf5mKEbIa1NzeCRNI5Ecl/ekmlYaampdNLPalVyIcCZNNH3MvmqBugV5TMYZXv0ljslUlaw==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.6", + "call-bound": "^1.0.2", "es-errors": "^1.3.0", - "is-regex": "^1.1.4" + "is-regex": "^1.2.1" }, "engines": { "node": ">= 0.4" @@ -21868,6 +23101,7 @@ "resolved": "https://registry.npmjs.org/set-function-name/-/set-function-name-2.0.2.tgz", "integrity": 
"sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==", "dev": true, + "license": "MIT", "dependencies": { "define-data-property": "^1.1.4", "es-errors": "^1.3.0", @@ -21878,6 +23112,21 @@ "node": ">= 0.4" } }, + "node_modules/set-proto": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/set-proto/-/set-proto-1.0.0.tgz", + "integrity": "sha512-RJRdvCo6IAnPdsvP/7m6bsQqNnn1FCBX5ZNtFL98MmFF/4xAIJTIg1YbHW5DC2W5SKZanrC6i4HsJqlajw/dZw==", + "dev": true, + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/setprototypeof": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", @@ -22048,15 +23297,73 @@ "dev": true }, "node_modules/side-channel": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", - "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz", + "integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.7", "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.4", - "object-inspect": "^1.13.1" + "object-inspect": "^1.13.3", + "side-channel-list": "^1.0.0", + "side-channel-map": "^1.0.1", + "side-channel-weakmap": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-list": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz", + "integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==", + 
"dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "object-inspect": "^1.13.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-map": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz", + "integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.5", + "object-inspect": "^1.13.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-weakmap": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz", + "integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.5", + "object-inspect": "^1.13.3", + "side-channel-map": "^1.0.1" }, "engines": { "node": ">= 0.4" @@ -22232,6 +23539,16 @@ "integrity": "sha512-ji9qxRnOVfcuLDySj9qzhGSEFVobyt1kIOSkj1qZzYLzq7Tos/oUUWvotUPQLlrsidqsK6tBH89Bc9kL5zHA6w==", "deprecated": "Modern JS already guarantees Array#sort() is a stable sort, so this library is deprecated. 
See the compatibility table on MDN: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/sort#browser_compatibility" }, + "node_modules/stable-hash-x": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/stable-hash-x/-/stable-hash-x-0.2.0.tgz", + "integrity": "sha512-o3yWv49B/o4QZk5ZcsALc6t0+eCelPc44zZsLtCQnZPDwFpDYSWcDnrv2TtMmMbQ7uKo3J0HTURCqckw23czNQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/stack-utils": { "version": "2.0.6", "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-2.0.6.tgz", @@ -22268,6 +23585,20 @@ "node": ">= 0.8" } }, + "node_modules/stop-iteration-iterator": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/stop-iteration-iterator/-/stop-iteration-iterator-1.1.0.tgz", + "integrity": "sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "internal-slot": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/string_decoder": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", @@ -22314,16 +23645,59 @@ "node": ">=8" } }, + "node_modules/string.prototype.matchall": { + "version": "4.0.12", + "resolved": "https://registry.npmjs.org/string.prototype.matchall/-/string.prototype.matchall-4.0.12.tgz", + "integrity": "sha512-6CC9uyBL+/48dYizRf7H7VAYCMCNTBeM78x/VTUe9bFEaxBepPJDa1Ow99LqI/1yF7kuy7Q3cQsYMrcjGUcskA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.6", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "get-intrinsic": "^1.2.6", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "internal-slot": "^1.1.0", + "regexp.prototype.flags": "^1.5.3", + "set-function-name": "^2.0.2", + "side-channel": 
"^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/string.prototype.repeat": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/string.prototype.repeat/-/string.prototype.repeat-1.0.0.tgz", + "integrity": "sha512-0u/TldDbKD8bFCQ/4f5+mNRrXwZ8hg2w7ZR8wa16e8z9XpePWl3eGEcUD0OXpEH/VJH/2G3gjUtR3ZOiBe2S/w==", + "dev": true, + "license": "MIT", + "dependencies": { + "define-properties": "^1.1.3", + "es-abstract": "^1.17.5" + } + }, "node_modules/string.prototype.trim": { - "version": "1.2.9", - "resolved": "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.9.tgz", - "integrity": "sha512-klHuCNxiMZ8MlsOihJhJEBJAiMVqU3Z2nEXWfWnIqjN0gEFS9J9+IxKozWWtQGcgoa1WUZzLjKPTr4ZHNFTFxw==", + "version": "1.2.10", + "resolved": "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.10.tgz", + "integrity": "sha512-Rs66F0P/1kedk5lyYyH9uBzuiI/kNRmwJAR9quK6VOtIpZ2G+hMZd+HQbbv25MgCA6gEffoMZYxlTod4WcdrKA==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.7", + "call-bind": "^1.0.8", + "call-bound": "^1.0.2", + "define-data-property": "^1.1.4", "define-properties": "^1.2.1", - "es-abstract": "^1.23.0", - "es-object-atoms": "^1.0.0" + "es-abstract": "^1.23.5", + "es-object-atoms": "^1.0.0", + "has-property-descriptors": "^1.0.2" }, "engines": { "node": ">= 0.4" @@ -22333,15 +23707,20 @@ } }, "node_modules/string.prototype.trimend": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/string.prototype.trimend/-/string.prototype.trimend-1.0.8.tgz", - "integrity": "sha512-p73uL5VCHCO2BZZ6krwwQE3kCzM7NKmis8S//xEC6fQonchbum4eP6kR4DLEjQFO3Wnj3Fuo8NM0kOSjVdHjZQ==", + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/string.prototype.trimend/-/string.prototype.trimend-1.0.9.tgz", + "integrity": "sha512-G7Ok5C6E/j4SGfyLCloXTrngQIQU3PWtXGst3yM7Bea9FRURf1S42ZHlZZtsNque2FN2PoUhfZXYLNWwEr4dLQ==", "dev": true, 
+ "license": "MIT", "dependencies": { - "call-bind": "^1.0.7", + "call-bind": "^1.0.8", + "call-bound": "^1.0.2", "define-properties": "^1.2.1", "es-object-atoms": "^1.0.0" }, + "engines": { + "node": ">= 0.4" + }, "funding": { "url": "https://github.com/sponsors/ljharb" } @@ -23025,6 +24404,42 @@ } } }, + "node_modules/tsconfig-paths": { + "version": "3.15.0", + "resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.15.0.tgz", + "integrity": "sha512-2Ac2RgzDe/cn48GvOe3M+o82pEFewD3UPbyoUHHdKasHwJKjds4fLXWf/Ux5kATBKN20oaFGu+jbElp1pos0mg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/json5": "^0.0.29", + "json5": "^1.0.2", + "minimist": "^1.2.6", + "strip-bom": "^3.0.0" + } + }, + "node_modules/tsconfig-paths/node_modules/json5": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/json5/-/json5-1.0.2.tgz", + "integrity": "sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==", + "dev": true, + "license": "MIT", + "dependencies": { + "minimist": "^1.2.0" + }, + "bin": { + "json5": "lib/cli.js" + } + }, + "node_modules/tsconfig-paths/node_modules/strip-bom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", + "integrity": "sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/tslib": { "version": "2.8.0", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.0.tgz", @@ -23077,30 +24492,32 @@ } }, "node_modules/typed-array-buffer": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.2.tgz", - "integrity": "sha512-gEymJYKZtKXzzBzM4jqa9w6Q1Jjm7x2d+sh19AdsD4wqnMPDYyvwpsIc2Q/835kHuo3BEQ7CjelGhfTsoBb2MQ==", + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.3.tgz", + "integrity": 
"sha512-nAYYwfY3qnzX30IkA6AQZjVbtK6duGontcQm1WSG1MD94YLqK0515GNApXkoxKOWMusVssAHWLh9SeaoefYFGw==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.7", + "call-bound": "^1.0.3", "es-errors": "^1.3.0", - "is-typed-array": "^1.1.13" + "is-typed-array": "^1.1.14" }, "engines": { "node": ">= 0.4" } }, "node_modules/typed-array-byte-length": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.1.tgz", - "integrity": "sha512-3iMJ9q0ao7WE9tWcaYKIptkNBuOIcZCCT0d4MRvuuH88fEoEH62IuQe0OtraD3ebQEoTRk8XCBoknUNc1Y67pw==", + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.3.tgz", + "integrity": "sha512-BaXgOuIxz8n8pIq3e7Atg/7s+DpiYrxn4vdot3w9KbnBhcRQq6o3xemQdIfynqSeXeDrF32x+WvfzmOjPiY9lg==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.7", + "call-bind": "^1.0.8", "for-each": "^0.3.3", - "gopd": "^1.0.1", - "has-proto": "^1.0.3", - "is-typed-array": "^1.1.13" + "gopd": "^1.2.0", + "has-proto": "^1.2.0", + "is-typed-array": "^1.1.14" }, "engines": { "node": ">= 0.4" @@ -23110,17 +24527,19 @@ } }, "node_modules/typed-array-byte-offset": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.2.tgz", - "integrity": "sha512-Ous0vodHa56FviZucS2E63zkgtgrACj7omjwd/8lTEMEPFFyjfixMZ1ZXenpgCFBBt4EC1J2XsyVS2gkG0eTFA==", + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.4.tgz", + "integrity": "sha512-bTlAFB/FBYMcuX81gbL4OcpH5PmlFHqlCCpAl8AlEzMz5k53oNDvN8p1PNOWLEmI2x4orp3raOFB51tv9X+MFQ==", "dev": true, + "license": "MIT", "dependencies": { "available-typed-arrays": "^1.0.7", - "call-bind": "^1.0.7", + "call-bind": "^1.0.8", "for-each": "^0.3.3", - "gopd": "^1.0.1", - "has-proto": "^1.0.3", - "is-typed-array": "^1.1.13" + "gopd": "^1.2.0", + "has-proto": "^1.2.0", + 
"is-typed-array": "^1.1.15", + "reflect.getprototypeof": "^1.0.9" }, "engines": { "node": ">= 0.4" @@ -23130,17 +24549,18 @@ } }, "node_modules/typed-array-length": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.6.tgz", - "integrity": "sha512-/OxDN6OtAk5KBpGb28T+HZc2M+ADtvRxXrKKbUwtsLgdoxgX13hyy7ek6bFRl5+aBs2yZzB0c4CnQfAtVypW/g==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.7.tgz", + "integrity": "sha512-3KS2b+kL7fsuk/eJZ7EQdnEmQoaho/r6KUef7hxvltNA5DR8NAUM+8wJMbJyZ4G9/7i3v5zPBIMN5aybAh2/Jg==", "dev": true, + "license": "MIT", "dependencies": { "call-bind": "^1.0.7", "for-each": "^0.3.3", "gopd": "^1.0.1", - "has-proto": "^1.0.3", "is-typed-array": "^1.1.13", - "possible-typed-array-names": "^1.0.0" + "possible-typed-array-names": "^1.0.0", + "reflect.getprototypeof": "^1.0.6" }, "engines": { "node": ">= 0.4" @@ -23163,15 +24583,19 @@ } }, "node_modules/unbox-primitive": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.0.2.tgz", - "integrity": "sha512-61pPlCD9h51VoreyJ0BReideM3MDKMKnh6+V9L08331ipq6Q8OFXZYiqP6n/tbHx4s5I9uRhcye6BrbkizkBDw==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", + "integrity": "sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==", "dev": true, + "license": "MIT", "dependencies": { - "call-bind": "^1.0.2", + "call-bound": "^1.0.3", "has-bigints": "^1.0.2", - "has-symbols": "^1.0.3", - "which-boxed-primitive": "^1.0.2" + "has-symbols": "^1.1.0", + "which-boxed-primitive": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -23249,6 +24673,41 @@ "node": ">= 0.8" } }, + "node_modules/unrs-resolver": { + "version": "1.11.1", + "resolved": 
"https://registry.npmjs.org/unrs-resolver/-/unrs-resolver-1.11.1.tgz", + "integrity": "sha512-bSjt9pjaEBnNiGgc9rUiHGKv5l4/TGzDmYw3RhnkJGtLhbnnA/5qJj7x3dNDCRx/PJxu774LlH8lCOlB4hEfKg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "napi-postinstall": "^0.3.0" + }, + "funding": { + "url": "https://opencollective.com/unrs-resolver" + }, + "optionalDependencies": { + "@unrs/resolver-binding-android-arm-eabi": "1.11.1", + "@unrs/resolver-binding-android-arm64": "1.11.1", + "@unrs/resolver-binding-darwin-arm64": "1.11.1", + "@unrs/resolver-binding-darwin-x64": "1.11.1", + "@unrs/resolver-binding-freebsd-x64": "1.11.1", + "@unrs/resolver-binding-linux-arm-gnueabihf": "1.11.1", + "@unrs/resolver-binding-linux-arm-musleabihf": "1.11.1", + "@unrs/resolver-binding-linux-arm64-gnu": "1.11.1", + "@unrs/resolver-binding-linux-arm64-musl": "1.11.1", + "@unrs/resolver-binding-linux-ppc64-gnu": "1.11.1", + "@unrs/resolver-binding-linux-riscv64-gnu": "1.11.1", + "@unrs/resolver-binding-linux-riscv64-musl": "1.11.1", + "@unrs/resolver-binding-linux-s390x-gnu": "1.11.1", + "@unrs/resolver-binding-linux-x64-gnu": "1.11.1", + "@unrs/resolver-binding-linux-x64-musl": "1.11.1", + "@unrs/resolver-binding-wasm32-wasi": "1.11.1", + "@unrs/resolver-binding-win32-arm64-msvc": "1.11.1", + "@unrs/resolver-binding-win32-ia32-msvc": "1.11.1", + "@unrs/resolver-binding-win32-x64-msvc": "1.11.1" + } + }, "node_modules/update-browserslist-db": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.1.tgz", @@ -23849,31 +25308,85 @@ } }, "node_modules/which-boxed-primitive": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/which-boxed-primitive/-/which-boxed-primitive-1.1.1.tgz", + "integrity": "sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-bigint": "^1.1.0", + 
"is-boolean-object": "^1.2.1", + "is-number-object": "^1.1.1", + "is-string": "^1.1.1", + "is-symbol": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/which-builtin-type": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/which-builtin-type/-/which-builtin-type-1.2.1.tgz", + "integrity": "sha512-6iBczoX+kDQ7a3+YJBnh3T+KZRxM/iYNPXicqk66/Qfm1b93iu+yOImkg0zHbj5LNOcNv1TEADiZ0xa34B4q6Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "function.prototype.name": "^1.1.6", + "has-tostringtag": "^1.0.2", + "is-async-function": "^2.0.0", + "is-date-object": "^1.1.0", + "is-finalizationregistry": "^1.1.0", + "is-generator-function": "^1.0.10", + "is-regex": "^1.2.1", + "is-weakref": "^1.0.2", + "isarray": "^2.0.5", + "which-boxed-primitive": "^1.1.0", + "which-collection": "^1.0.2", + "which-typed-array": "^1.1.16" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/which-collection": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/which-boxed-primitive/-/which-boxed-primitive-1.0.2.tgz", - "integrity": "sha512-bwZdv0AKLpplFY2KZRX6TvyuN7ojjr7lwkg6ml0roIy9YeuSr7JS372qlNW18UQYzgYK9ziGcerWqZOmEn9VNg==", + "resolved": "https://registry.npmjs.org/which-collection/-/which-collection-1.0.2.tgz", + "integrity": "sha512-K4jVyjnBdgvc86Y6BkaLZEN933SwYOuBFkdmBu9ZfkcAbdVbpITnDmjvZ/aQjRXQrv5EPkTnD1s39GiiqbngCw==", "dev": true, + "license": "MIT", "dependencies": { - "is-bigint": "^1.0.1", - "is-boolean-object": "^1.1.0", - "is-number-object": "^1.0.4", - "is-string": "^1.0.5", - "is-symbol": "^1.0.3" + "is-map": "^2.0.3", + "is-set": "^2.0.3", + "is-weakmap": "^2.0.2", + "is-weakset": "^2.0.3" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" } }, "node_modules/which-typed-array": { - "version": "1.1.15", - 
"resolved": "https://registry.npmjs.org/which-typed-array/-/which-typed-array-1.1.15.tgz", - "integrity": "sha512-oV0jmFtUky6CXfkqehVvBP/LSWJ2sy4vWMioiENyJLePrBO/yKyV9OyJySfAKosh+RYkIl5zJCNZ8/4JncrpdA==", + "version": "1.1.20", + "resolved": "https://registry.npmjs.org/which-typed-array/-/which-typed-array-1.1.20.tgz", + "integrity": "sha512-LYfpUkmqwl0h9A2HL09Mms427Q1RZWuOHsukfVcKRq9q95iQxdw0ix1JQrqbcDR9PH1QDwf5Qo8OZb5lksZ8Xg==", "dev": true, + "license": "MIT", "dependencies": { "available-typed-arrays": "^1.0.7", - "call-bind": "^1.0.7", - "for-each": "^0.3.3", - "gopd": "^1.0.1", + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "for-each": "^0.3.5", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", "has-tostringtag": "^1.0.2" }, "engines": { diff --git a/frontend/package.json b/frontend/package.json index 9d581114df..f2ef0d9ca1 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -59,10 +59,11 @@ "cross-env": "^7.0.3", "css-loader": "^6.7.3", "enzyme": "^3.11.0", - "eslint": "^9.32.0", + "eslint": "^9.39.2", "eslint-config-prettier": "^10.1.5", "eslint-plugin-i18n": "^2.4.0", "eslint-plugin-prettier": "^5.4.1", + "eslint-plugin-react": "^7.37.5", "eslint-plugin-simple-import-sort": "^12.1.1", "favicons": "^7.2.0", "favicons-webpack-plugin": "^6.0.1", diff --git a/frontend/src/components/Code/index.tsx b/frontend/src/components/Code/index.tsx index 180d4586a7..db16c4fcb1 100644 --- a/frontend/src/components/Code/index.tsx +++ b/frontend/src/components/Code/index.tsx @@ -17,3 +17,5 @@ export const Code = forwardRef(({ children, className }, ); }); + +Code.displayName = 'Code'; From 28797f65e322aa316ce6b0adff815925d45abf96 Mon Sep 17 00:00:00 2001 From: Oleg Vavilov Date: Thu, 22 Jan 2026 16:58:56 +0300 Subject: [PATCH 070/187] Fix CI errors --- frontend/package-lock.json | 1677 ++++++++++++++---------------------- 1 file changed, 626 insertions(+), 1051 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 
a3d3f29d93..445c61daa9 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -84,8 +84,6 @@ "eslint": "^9.39.2", "eslint-config-prettier": "^10.1.5", "eslint-plugin-i18n": "^2.4.0", - "eslint-plugin-import": "^2.32.0", - "eslint-plugin-import-x": "^4.16.1", "eslint-plugin-prettier": "^5.4.1", "eslint-plugin-react": "^7.37.5", "eslint-plugin-simple-import-sort": "^12.1.1", @@ -163,9 +161,9 @@ } }, "node_modules/@apidevtools/json-schema-ref-parser/node_modules/js-yaml": { - "version": "3.14.1", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.1.tgz", - "integrity": "sha512-okMH7OXXJ7YrN9Ok3/SXrnu4iX9yOk+25nqX4imS2npuvTYDmo/QEZoqwZkYaIDk3jVvBOTOIEgEhaLOynBS9g==", + "version": "3.14.2", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.2.tgz", + "integrity": "sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg==", "dev": true, "license": "MIT", "dependencies": { @@ -239,14 +237,15 @@ } }, "node_modules/@babel/code-frame": { - "version": "7.26.0", - "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.26.0.tgz", - "integrity": "sha512-INCKxTtbXtcNbUZ3YXutwMpEleqttcswhAdee7dhuoVrD2cnuc3PqtERBtxkX5nziX9vnBL8WXmSGwv8CuPV6g==", + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.28.6.tgz", + "integrity": "sha512-JYgintcMjRiCvS8mMECzaEn+m3PfoQiyqukOMCCVQtoJGYJw8j/8LBJEiqkHLkfwCcs74E3pbAUFNg7d9VNJ+Q==", "dev": true, + "license": "MIT", "dependencies": { - "@babel/helper-validator-identifier": "^7.25.9", + "@babel/helper-validator-identifier": "^7.28.5", "js-tokens": "^4.0.0", - "picocolors": "^1.0.0" + "picocolors": "^1.1.1" }, "engines": { "node": ">=6.9.0" @@ -527,19 +526,21 @@ } }, "node_modules/@babel/helper-string-parser": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.25.9.tgz", - "integrity": 
"sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==", + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", "dev": true, + "license": "MIT", "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/helper-validator-identifier": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.25.9.tgz", - "integrity": "sha512-Ed61U6XJc3CVRfkERJWDz4dJwKe7iLmmJsbOGu9wSloNSFttHV0I8g6UAgb7qnK5ly5bGLPd4oXZlxCdANBOWQ==", + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", "dev": true, + "license": "MIT", "engines": { "node": ">=6.9.0" } @@ -568,25 +569,27 @@ } }, "node_modules/@babel/helpers": { - "version": "7.26.0", - "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.26.0.tgz", - "integrity": "sha512-tbhNuIxNcVb21pInl3ZSjksLCvgdZy9KwJ8brv993QtIVKJBBkYXz4q4ZbAv31GdnC+R90np23L5FbEBlthAEw==", + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.28.6.tgz", + "integrity": "sha512-xOBvwq86HHdB7WUDTfKfT/Vuxh7gElQ+Sfti2Cy6yIWNW05P8iUslOVcZ4/sKbE+/jQaukQAdz/gf3724kYdqw==", "dev": true, + "license": "MIT", "dependencies": { - "@babel/template": "^7.25.9", - "@babel/types": "^7.26.0" + "@babel/template": "^7.28.6", + "@babel/types": "^7.28.6" }, "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/parser": { - "version": "7.26.1", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.26.1.tgz", - "integrity": "sha512-reoQYNiAJreZNsJzyrDNzFQ+IQ5JFiIzAHJg9bn94S3l+4++J7RsIhNMoB+lgP/9tpmiAQqspv+xfdxTSzREOw==", + 
"version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.28.6.tgz", + "integrity": "sha512-TeR9zWR18BvbfPmGbLampPMW+uW1NZnJlRuuHso8i87QZNq2JRF9i6RgxRqtEq+wQGsS19NNTWr2duhnE49mfQ==", "dev": true, + "license": "MIT", "dependencies": { - "@babel/types": "^7.26.0" + "@babel/types": "^7.28.6" }, "bin": { "parser": "bin/babel-parser.js" @@ -2020,25 +2023,24 @@ } }, "node_modules/@babel/runtime": { - "version": "7.26.0", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.26.0.tgz", - "integrity": "sha512-FDSOghenHTiToteC/QRlv2q3DhPZ/oOXTBoirfWNx1Cx3TMVcGWQtMMmQcSvb/JjpNeGzx8Pq/b4fKEJuWm1sw==", - "dependencies": { - "regenerator-runtime": "^0.14.0" - }, + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.28.6.tgz", + "integrity": "sha512-05WQkdpL9COIMz4LjTxGpPNCdlpyimKppYNoJ5Di5EUObifl8t4tuLuUBBZEpoLYOmfvIWrsp9fCl0HoPRVTdA==", + "license": "MIT", "engines": { "node": ">=6.9.0" } }, "node_modules/@babel/template": { - "version": "7.25.9", - "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.25.9.tgz", - "integrity": "sha512-9DGttpmPvIxBb/2uwpVo3dqJ+O6RooAFOS+lB+xDqoE2PVCE8nfoHMdZLpfCQRLwvohzXISPZcgxt80xLfsuwg==", + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz", + "integrity": "sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==", "dev": true, + "license": "MIT", "dependencies": { - "@babel/code-frame": "^7.25.9", - "@babel/parser": "^7.25.9", - "@babel/types": "^7.25.9" + "@babel/code-frame": "^7.28.6", + "@babel/parser": "^7.28.6", + "@babel/types": "^7.28.6" }, "engines": { "node": ">=6.9.0" @@ -2063,13 +2065,14 @@ } }, "node_modules/@babel/types": { - "version": "7.26.0", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.26.0.tgz", - "integrity": "sha512-Z/yiTPj+lDVnF7lWeKCIJzaIkI0vYO87dMpZ4bg4TDrFe4XXLFWL1TbXU27gBP3QccxV9mZICCrnjnYlJjXHOA==", + 
"version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.28.6.tgz", + "integrity": "sha512-0ZrskXVEHSWIqZM/sQZ4EV3jZJXRkio/WCxaqKZP1g//CEWEPSfeZFcms4XeKBCHU0ZKnIkdJeU/kF+eRp5lBg==", "dev": true, + "license": "MIT", "dependencies": { - "@babel/helper-string-parser": "^7.25.9", - "@babel/helper-validator-identifier": "^7.25.9" + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" }, "engines": { "node": ">=6.9.0" @@ -2551,18 +2554,6 @@ "react": ">=16.8.0" } }, - "node_modules/@emnapi/core": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.8.1.tgz", - "integrity": "sha512-AvT9QFpxK0Zd8J0jopedNm+w/2fIzvtPKPjqyw9jwvBaReTTqPBk9Hixaz7KbjimP+QNz605/XnjFcDAL2pqBg==", - "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "@emnapi/wasi-threads": "1.1.0", - "tslib": "^2.4.0" - } - }, "node_modules/@emnapi/runtime": { "version": "1.8.1", "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.8.1.tgz", @@ -2574,17 +2565,6 @@ "tslib": "^2.4.0" } }, - "node_modules/@emnapi/wasi-threads": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.1.0.tgz", - "integrity": "sha512-WI0DdZ8xFSbgMjR1sFsKABJ/C5OnRrjT06JXbZKexJGrDuPTzZdDYfFlsgcCXCyf+suG5QU2e/y1Wo2V/OapLQ==", - "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "tslib": "^2.4.0" - } - }, "node_modules/@emotion/is-prop-valid": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-1.2.2.tgz", @@ -3261,29 +3241,6 @@ "url": "https://opencollective.com/libvips" } }, - "node_modules/@isaacs/balanced-match": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/balanced-match/-/balanced-match-4.0.1.tgz", - "integrity": "sha512-yzMTt9lEb8Gv7zRioUilSglI0c0smZ9k5D65677DLWLtWJaXIS3CqcGyUFByYKlnUj6TkjLVs54fBl6+TiGQDQ==", - "dev": true, - "license": "MIT", - 
"engines": { - "node": "20 || >=22" - } - }, - "node_modules/@isaacs/brace-expansion": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/@isaacs/brace-expansion/-/brace-expansion-5.0.0.tgz", - "integrity": "sha512-ZT55BDLV0yv0RBm2czMiZ+SqCGO7AvmOM3G/w2xhVPH+te0aKgFjmBvGlL1dH+ql2tgGO3MVrbb3jCKyvpgnxA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@isaacs/balanced-match": "^4.0.1" - }, - "engines": { - "node": "20 || >=22" - } - }, "node_modules/@istanbuljs/load-nyc-config": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@istanbuljs/load-nyc-config/-/load-nyc-config-1.1.0.tgz", @@ -3333,9 +3290,9 @@ } }, "node_modules/@istanbuljs/load-nyc-config/node_modules/js-yaml": { - "version": "3.14.1", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.1.tgz", - "integrity": "sha512-okMH7OXXJ7YrN9Ok3/SXrnu4iX9yOk+25nqX4imS2npuvTYDmo/QEZoqwZkYaIDk3jVvBOTOIEgEhaLOynBS9g==", + "version": "3.14.2", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.2.tgz", + "integrity": "sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg==", "dev": true, "license": "MIT", "dependencies": { @@ -3873,19 +3830,6 @@ "integrity": "sha512-Vo+PSpZG2/fmgmiNzYK9qWRh8h/CHrwD0mo1h1DzL4yzHNSfWYujGTYsWGreD000gcgmZ7K4Ys6Tx9TxtsKdDw==", "dev": true }, - "node_modules/@napi-rs/wasm-runtime": { - "version": "0.2.12", - "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", - "integrity": "sha512-ZVWUcfwY4E/yPitQJl481FjFo3K22D6qF0DuFH6Y/nbnE11GY5uguDxZMGXPQ8WQ0128MXQD7TnfHyK4oWoIJQ==", - "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "@emnapi/core": "^1.4.3", - "@emnapi/runtime": "^1.4.3", - "@tybys/wasm-util": "^0.10.0" - } - }, "node_modules/@nicolo-ribaudo/chokidar-2": { "version": "2.1.8-no-fsevents.3", "resolved": "https://registry.npmjs.org/@nicolo-ribaudo/chokidar-2/-/chokidar-2-2.1.8-no-fsevents.3.tgz", @@ -3893,6 +3837,19 @@ 
"dev": true, "optional": true }, + "node_modules/@noble/hashes": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/@noble/hashes/-/hashes-1.4.0.tgz", + "integrity": "sha512-V1JJ1WTRUqHHrOSh597hURcMqVKVGL/ea3kv0gSnEdsEZ0/+VyPghM1lMNGc00z7CIQorSvbKpuJkxvuHbvdbg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 16" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, "node_modules/@nodelib/fs.scandir": { "version": "2.1.5", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", @@ -4209,6 +4166,165 @@ "url": "https://opencollective.com/parcel" } }, + "node_modules/@peculiar/asn1-cms": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-cms/-/asn1-cms-2.6.0.tgz", + "integrity": "sha512-2uZqP+ggSncESeUF/9Su8rWqGclEfEiz1SyU02WX5fUONFfkjzS2Z/F1Li0ofSmf4JqYXIOdCAZqIXAIBAT1OA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "@peculiar/asn1-x509-attr": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-csr": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-csr/-/asn1-csr-2.6.0.tgz", + "integrity": "sha512-BeWIu5VpTIhfRysfEp73SGbwjjoLL/JWXhJ/9mo4vXnz3tRGm+NGm3KNcRzQ9VMVqwYS2RHlolz21svzRXIHPQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-ecc": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-ecc/-/asn1-ecc-2.6.0.tgz", + "integrity": "sha512-FF3LMGq6SfAOwUG2sKpPXblibn6XnEIKa+SryvUl5Pik+WR9rmRA3OCiwz8R3lVXnYnyRkSZsSLdml8H3UiOcw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + 
"node_modules/@peculiar/asn1-pfx": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-pfx/-/asn1-pfx-2.6.0.tgz", + "integrity": "sha512-rtUvtf+tyKGgokHHmZzeUojRZJYPxoD/jaN1+VAB4kKR7tXrnDCA/RAWXAIhMJJC+7W27IIRGe9djvxKgsldCQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-cms": "^2.6.0", + "@peculiar/asn1-pkcs8": "^2.6.0", + "@peculiar/asn1-rsa": "^2.6.0", + "@peculiar/asn1-schema": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-pkcs8": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-pkcs8/-/asn1-pkcs8-2.6.0.tgz", + "integrity": "sha512-KyQ4D8G/NrS7Fw3XCJrngxmjwO/3htnA0lL9gDICvEQ+GJ+EPFqldcJQTwPIdvx98Tua+WjkdKHSC0/Km7T+lA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-pkcs9": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-pkcs9/-/asn1-pkcs9-2.6.0.tgz", + "integrity": "sha512-b78OQ6OciW0aqZxdzliXGYHASeCvvw5caqidbpQRYW2mBtXIX2WhofNXTEe7NyxTb0P6J62kAAWLwn0HuMF1Fw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-cms": "^2.6.0", + "@peculiar/asn1-pfx": "^2.6.0", + "@peculiar/asn1-pkcs8": "^2.6.0", + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "@peculiar/asn1-x509-attr": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-rsa": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-rsa/-/asn1-rsa-2.6.0.tgz", + "integrity": "sha512-Nu4C19tsrTsCp9fDrH+sdcOKoVfdfoQQ7S3VqjJU6vedR7tY3RLkQ5oguOIB3zFW33USDUuYZnPEQYySlgha4w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + 
"node_modules/@peculiar/asn1-schema": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-schema/-/asn1-schema-2.6.0.tgz", + "integrity": "sha512-xNLYLBFTBKkCzEZIw842BxytQQATQv+lDTCEMZ8C196iJcJJMBUZxrhSTxLaohMyKK8QlzRNTRkUmanucnDSqg==", + "dev": true, + "license": "MIT", + "dependencies": { + "asn1js": "^3.0.6", + "pvtsutils": "^1.3.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-x509": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-x509/-/asn1-x509-2.6.0.tgz", + "integrity": "sha512-uzYbPEpoQiBoTq0/+jZtpM6Gq6zADBx+JNFP3yqRgziWBxQ/Dt/HcuvRfm9zJTPdRcBqPNdaRHTVwpyiq6iNMA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "asn1js": "^3.0.6", + "pvtsutils": "^1.3.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/asn1-x509-attr": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/@peculiar/asn1-x509-attr/-/asn1-x509-attr-2.6.0.tgz", + "integrity": "sha512-MuIAXFX3/dc8gmoZBkwJWxUWOSvG4MMDntXhrOZpJVMkYX+MYc/rUAU2uJOved9iJEoiUx7//3D8oG83a78UJA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "asn1js": "^3.0.6", + "tslib": "^2.8.1" + } + }, + "node_modules/@peculiar/x509": { + "version": "1.14.3", + "resolved": "https://registry.npmjs.org/@peculiar/x509/-/x509-1.14.3.tgz", + "integrity": "sha512-C2Xj8FZ0uHWeCXXqX5B4/gVFQmtSkiuOolzAgutjTfseNOHT3pUjljDZsTSxXFGgio54bCzVFqmEOUrIVk8RDA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@peculiar/asn1-cms": "^2.6.0", + "@peculiar/asn1-csr": "^2.6.0", + "@peculiar/asn1-ecc": "^2.6.0", + "@peculiar/asn1-pkcs9": "^2.6.0", + "@peculiar/asn1-rsa": "^2.6.0", + "@peculiar/asn1-schema": "^2.6.0", + "@peculiar/asn1-x509": "^2.6.0", + "pvtsutils": "^1.3.6", + "reflect-metadata": "^0.2.2", + "tslib": "^2.8.1", + "tsyringe": "^4.10.0" + }, + "engines": { + "node": ">=20.0.0" + } + }, 
"node_modules/@pkgr/core": { "version": "0.2.7", "resolved": "https://registry.npmjs.org/@pkgr/core/-/core-0.2.7.tgz", @@ -4324,9 +4440,10 @@ } }, "node_modules/@remix-run/router": { - "version": "1.20.0", - "resolved": "https://registry.npmjs.org/@remix-run/router/-/router-1.20.0.tgz", - "integrity": "sha512-mUnk8rPJBI9loFDZ+YzPGdeniYK+FTmRD1TMCz7ev2SNIozyKKpnGgsxO34u6Z4z/t0ITuu7voi/AshfsGsgFg==", + "version": "1.23.2", + "resolved": "https://registry.npmjs.org/@remix-run/router/-/router-1.23.2.tgz", + "integrity": "sha512-Ic6m2U/rMjTkhERIa/0ZtXJP17QUi2CbWE7cqx4J58M8aA3QTfW+2UlQ4psvTX9IO1RfNVhK3pcpdjej7L+t2w==", + "license": "MIT", "engines": { "node": ">=14.0.0" } @@ -4362,13 +4479,6 @@ "node": ">=10" } }, - "node_modules/@rtsao/scc": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@rtsao/scc/-/scc-1.1.0.tgz", - "integrity": "sha512-zt6OdqaDoOnJ1ZYsCYGt9YmWzDXl4vQdKTyJev62gFhRGKdx7mcT54V9KIjg+d2wi9EXsPvAPKe7i7WjfVWB8g==", - "dev": true, - "license": "MIT" - }, "node_modules/@sinclair/typebox": { "version": "0.27.8", "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz", @@ -4780,17 +4890,6 @@ "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==", "dev": true }, - "node_modules/@tybys/wasm-util": { - "version": "0.10.1", - "resolved": "https://registry.npmjs.org/@tybys/wasm-util/-/wasm-util-0.10.1.tgz", - "integrity": "sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg==", - "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "tslib": "^2.4.0" - } - }, "node_modules/@types/aria-query": { "version": "5.0.4", "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", @@ -4951,15 +5050,16 @@ "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==" }, "node_modules/@types/express": { - "version": "4.17.21", - "resolved": 
"https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz", - "integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==", + "version": "4.17.25", + "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.25.tgz", + "integrity": "sha512-dVd04UKsfpINUnK0yBoYHDF3xu7xVH4BuDotC/xGuycx4CgbP48X/KF/586bcObxT0HENHXEU8Nqtu6NR+eKhw==", "dev": true, + "license": "MIT", "dependencies": { "@types/body-parser": "*", "@types/express-serve-static-core": "^4.17.33", "@types/qs": "*", - "@types/serve-static": "*" + "@types/serve-static": "^1" } }, "node_modules/@types/express-serve-static-core": { @@ -5100,13 +5200,6 @@ "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==" }, - "node_modules/@types/json5": { - "version": "0.0.29", - "resolved": "https://registry.npmjs.org/@types/json5/-/json5-0.0.29.tgz", - "integrity": "sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ==", - "dev": true, - "license": "MIT" - }, "node_modules/@types/lodash": { "version": "4.17.13", "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.13.tgz", @@ -5135,15 +5228,6 @@ "form-data": "^4.0.0" } }, - "node_modules/@types/node-forge": { - "version": "1.3.11", - "resolved": "https://registry.npmjs.org/@types/node-forge/-/node-forge-1.3.11.tgz", - "integrity": "sha512-FQx220y22OKNTqaByeBGqHWYz4cl94tpcxeFdvBo3wjG6XPBuZ0BNgNZRV5J5TFmmcsJ4IzsLkmGRiQbnYsBEQ==", - "dev": true, - "dependencies": { - "@types/node": "*" - } - }, "node_modules/@types/parse-json": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/@types/parse-json/-/parse-json-4.0.2.tgz", @@ -5532,9 +5616,9 @@ } }, "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { - "version": "2.0.1", - "resolved": 
"https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", - "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", + "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", "dev": true, "license": "MIT", "dependencies": { @@ -5625,324 +5709,55 @@ "url": "https://opencollective.com/eslint" } }, - "node_modules/@unrs/resolver-binding-android-arm-eabi": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-android-arm-eabi/-/resolver-binding-android-arm-eabi-1.11.1.tgz", - "integrity": "sha512-ppLRUgHVaGRWUx0R0Ut06Mjo9gBaBkg3v/8AxusGLhsIotbBLuRk51rAzqLC8gq6NyyAojEXglNjzf6R948DNw==", - "cpu": [ - "arm" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "android" - ] + "node_modules/@webassemblyjs/ast": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.12.1.tgz", + "integrity": "sha512-EKfMUOPRRUTy5UII4qJDGPpqfwjOmZ5jeGFwid9mnoqIFK+e0vqoi1qH56JpmZSzEL53jKnNzScdmftJyG5xWg==", + "dependencies": { + "@webassemblyjs/helper-numbers": "1.11.6", + "@webassemblyjs/helper-wasm-bytecode": "1.11.6" + } }, - "node_modules/@unrs/resolver-binding-android-arm64": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-android-arm64/-/resolver-binding-android-arm64-1.11.1.tgz", - "integrity": "sha512-lCxkVtb4wp1v+EoN+HjIG9cIIzPkX5OtM03pQYkG+U5O/wL53LC4QbIeazgiKqluGeVEeBlZahHalCaBvU1a2g==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "android" - ] + "node_modules/@webassemblyjs/floating-point-hex-parser": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.11.6.tgz", + "integrity": 
"sha512-ejAj9hfRJ2XMsNHk/v6Fu2dGS+i4UaXBXGemOfQ/JfQ6mdQg/WXtwleQRLLS4OvfDhv8rYnVwH27YJLMyYsxhw==" }, - "node_modules/@unrs/resolver-binding-darwin-arm64": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-darwin-arm64/-/resolver-binding-darwin-arm64-1.11.1.tgz", - "integrity": "sha512-gPVA1UjRu1Y/IsB/dQEsp2V1pm44Of6+LWvbLc9SDk1c2KhhDRDBUkQCYVWe6f26uJb3fOK8saWMgtX8IrMk3g==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ] + "node_modules/@webassemblyjs/helper-api-error": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.11.6.tgz", + "integrity": "sha512-o0YkoP4pVu4rN8aTJgAyj9hC2Sv5UlkzCHhxqWj8butaLvnpdc2jOwh4ewE6CX0txSfLn/UYaV/pheS2Txg//Q==" }, - "node_modules/@unrs/resolver-binding-darwin-x64": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-darwin-x64/-/resolver-binding-darwin-x64-1.11.1.tgz", - "integrity": "sha512-cFzP7rWKd3lZaCsDze07QX1SC24lO8mPty9vdP+YVa3MGdVgPmFc59317b2ioXtgCMKGiCLxJ4HQs62oz6GfRQ==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ] + "node_modules/@webassemblyjs/helper-buffer": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.12.1.tgz", + "integrity": "sha512-nzJwQw99DNDKr9BVCOZcLuJJUlqkJh+kVzVl6Fmq/tI5ZtEyWT1KZMyOXltXLZJmDtvLCDgwsyrkohEtopTXCw==" }, - "node_modules/@unrs/resolver-binding-freebsd-x64": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-freebsd-x64/-/resolver-binding-freebsd-x64-1.11.1.tgz", - "integrity": "sha512-fqtGgak3zX4DCB6PFpsH5+Kmt/8CIi4Bry4rb1ho6Av2QHTREM+47y282Uqiu3ZRF5IQioJQ5qWRV6jduA+iGw==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ] + "node_modules/@webassemblyjs/helper-numbers": { + 
"version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-numbers/-/helper-numbers-1.11.6.tgz", + "integrity": "sha512-vUIhZ8LZoIWHBohiEObxVm6hwP034jwmc9kuq5GdHZH0wiLVLIPcMCdpJzG4C11cHoQ25TFIQj9kaVADVX7N3g==", + "dependencies": { + "@webassemblyjs/floating-point-hex-parser": "1.11.6", + "@webassemblyjs/helper-api-error": "1.11.6", + "@xtuc/long": "4.2.2" + } }, - "node_modules/@unrs/resolver-binding-linux-arm-gnueabihf": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-arm-gnueabihf/-/resolver-binding-linux-arm-gnueabihf-1.11.1.tgz", - "integrity": "sha512-u92mvlcYtp9MRKmP+ZvMmtPN34+/3lMHlyMj7wXJDeXxuM0Vgzz0+PPJNsro1m3IZPYChIkn944wW8TYgGKFHw==", - "cpu": [ - "arm" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] + "node_modules/@webassemblyjs/helper-wasm-bytecode": { + "version": "1.11.6", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.11.6.tgz", + "integrity": "sha512-sFFHKwcmBprO9e7Icf0+gddyWYDViL8bpPjJJl0WHxCdETktXdmtWLGVzoHbqUcY4Be1LkNfwTmXOJUFZYSJdA==" }, - "node_modules/@unrs/resolver-binding-linux-arm-musleabihf": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-arm-musleabihf/-/resolver-binding-linux-arm-musleabihf-1.11.1.tgz", - "integrity": "sha512-cINaoY2z7LVCrfHkIcmvj7osTOtm6VVT16b5oQdS4beibX2SYBwgYLmqhBjA1t51CarSaBuX5YNsWLjsqfW5Cw==", - "cpu": [ - "arm" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-arm64-gnu": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-arm64-gnu/-/resolver-binding-linux-arm64-gnu-1.11.1.tgz", - "integrity": "sha512-34gw7PjDGB9JgePJEmhEqBhWvCiiWCuXsL9hYphDF7crW7UgI05gyBAi6MF58uGcMOiOqSJ2ybEeCvHcq0BCmQ==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - 
"os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-arm64-musl": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-arm64-musl/-/resolver-binding-linux-arm64-musl-1.11.1.tgz", - "integrity": "sha512-RyMIx6Uf53hhOtJDIamSbTskA99sPHS96wxVE/bJtePJJtpdKGXO1wY90oRdXuYOGOTuqjT8ACccMc4K6QmT3w==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-ppc64-gnu": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-ppc64-gnu/-/resolver-binding-linux-ppc64-gnu-1.11.1.tgz", - "integrity": "sha512-D8Vae74A4/a+mZH0FbOkFJL9DSK2R6TFPC9M+jCWYia/q2einCubX10pecpDiTmkJVUH+y8K3BZClycD8nCShA==", - "cpu": [ - "ppc64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-riscv64-gnu": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-riscv64-gnu/-/resolver-binding-linux-riscv64-gnu-1.11.1.tgz", - "integrity": "sha512-frxL4OrzOWVVsOc96+V3aqTIQl1O2TjgExV4EKgRY09AJ9leZpEg8Ak9phadbuX0BA4k8U5qtvMSQQGGmaJqcQ==", - "cpu": [ - "riscv64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-riscv64-musl": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-riscv64-musl/-/resolver-binding-linux-riscv64-musl-1.11.1.tgz", - "integrity": "sha512-mJ5vuDaIZ+l/acv01sHoXfpnyrNKOk/3aDoEdLO/Xtn9HuZlDD6jKxHlkN8ZhWyLJsRBxfv9GYM2utQ1SChKew==", - "cpu": [ - "riscv64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-s390x-gnu": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-s390x-gnu/-/resolver-binding-linux-s390x-gnu-1.11.1.tgz", 
- "integrity": "sha512-kELo8ebBVtb9sA7rMe1Cph4QHreByhaZ2QEADd9NzIQsYNQpt9UkM9iqr2lhGr5afh885d/cB5QeTXSbZHTYPg==", - "cpu": [ - "s390x" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-x64-gnu": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-x64-gnu/-/resolver-binding-linux-x64-gnu-1.11.1.tgz", - "integrity": "sha512-C3ZAHugKgovV5YvAMsxhq0gtXuwESUKc5MhEtjBpLoHPLYM+iuwSj3lflFwK3DPm68660rZ7G8BMcwSro7hD5w==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-linux-x64-musl": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-linux-x64-musl/-/resolver-binding-linux-x64-musl-1.11.1.tgz", - "integrity": "sha512-rV0YSoyhK2nZ4vEswT/QwqzqQXw5I6CjoaYMOX0TqBlWhojUf8P94mvI7nuJTeaCkkds3QE4+zS8Ko+GdXuZtA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@unrs/resolver-binding-wasm32-wasi": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-wasm32-wasi/-/resolver-binding-wasm32-wasi-1.11.1.tgz", - "integrity": "sha512-5u4RkfxJm+Ng7IWgkzi3qrFOvLvQYnPBmjmZQ8+szTK/b31fQCnleNl1GgEt7nIsZRIf5PLhPwT0WM+q45x/UQ==", - "cpu": [ - "wasm32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "@napi-rs/wasm-runtime": "^0.2.11" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@unrs/resolver-binding-win32-arm64-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-arm64-msvc/-/resolver-binding-win32-arm64-msvc-1.11.1.tgz", - "integrity": "sha512-nRcz5Il4ln0kMhfL8S3hLkxI85BXs3o8EYoattsJNdsX4YUU89iOkVn7g0VHSRxFuVMdM4Q1jEpIId1Ihim/Uw==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": 
[ - "win32" - ] - }, - "node_modules/@unrs/resolver-binding-win32-ia32-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-ia32-msvc/-/resolver-binding-win32-ia32-msvc-1.11.1.tgz", - "integrity": "sha512-DCEI6t5i1NmAZp6pFonpD5m7i6aFrpofcp4LA2i8IIq60Jyo28hamKBxNrZcyOwVOZkgsRp9O2sXWBWP8MnvIQ==", - "cpu": [ - "ia32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@unrs/resolver-binding-win32-x64-msvc": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/@unrs/resolver-binding-win32-x64-msvc/-/resolver-binding-win32-x64-msvc-1.11.1.tgz", - "integrity": "sha512-lrW200hZdbfRtztbygyaq/6jP6AKE8qQN2KvPcJ+x7wiD038YtnYtZ82IMNJ69GJibV7bwL3y9FgK+5w/pYt6g==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@webassemblyjs/ast": { - "version": "1.12.1", - "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.12.1.tgz", - "integrity": "sha512-EKfMUOPRRUTy5UII4qJDGPpqfwjOmZ5jeGFwid9mnoqIFK+e0vqoi1qH56JpmZSzEL53jKnNzScdmftJyG5xWg==", - "dependencies": { - "@webassemblyjs/helper-numbers": "1.11.6", - "@webassemblyjs/helper-wasm-bytecode": "1.11.6" - } - }, - "node_modules/@webassemblyjs/floating-point-hex-parser": { - "version": "1.11.6", - "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.11.6.tgz", - "integrity": "sha512-ejAj9hfRJ2XMsNHk/v6Fu2dGS+i4UaXBXGemOfQ/JfQ6mdQg/WXtwleQRLLS4OvfDhv8rYnVwH27YJLMyYsxhw==" - }, - "node_modules/@webassemblyjs/helper-api-error": { - "version": "1.11.6", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.11.6.tgz", - "integrity": "sha512-o0YkoP4pVu4rN8aTJgAyj9hC2Sv5UlkzCHhxqWj8butaLvnpdc2jOwh4ewE6CX0txSfLn/UYaV/pheS2Txg//Q==" - }, - "node_modules/@webassemblyjs/helper-buffer": { - "version": "1.12.1", - "resolved": 
"https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.12.1.tgz", - "integrity": "sha512-nzJwQw99DNDKr9BVCOZcLuJJUlqkJh+kVzVl6Fmq/tI5ZtEyWT1KZMyOXltXLZJmDtvLCDgwsyrkohEtopTXCw==" - }, - "node_modules/@webassemblyjs/helper-numbers": { - "version": "1.11.6", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-numbers/-/helper-numbers-1.11.6.tgz", - "integrity": "sha512-vUIhZ8LZoIWHBohiEObxVm6hwP034jwmc9kuq5GdHZH0wiLVLIPcMCdpJzG4C11cHoQ25TFIQj9kaVADVX7N3g==", - "dependencies": { - "@webassemblyjs/floating-point-hex-parser": "1.11.6", - "@webassemblyjs/helper-api-error": "1.11.6", - "@xtuc/long": "4.2.2" - } - }, - "node_modules/@webassemblyjs/helper-wasm-bytecode": { - "version": "1.11.6", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.11.6.tgz", - "integrity": "sha512-sFFHKwcmBprO9e7Icf0+gddyWYDViL8bpPjJJl0WHxCdETktXdmtWLGVzoHbqUcY4Be1LkNfwTmXOJUFZYSJdA==" - }, - "node_modules/@webassemblyjs/helper-wasm-section": { - "version": "1.12.1", - "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.12.1.tgz", - "integrity": "sha512-Jif4vfB6FJlUlSbgEMHUyk1j234GTNG9dBJ4XJdOySoj518Xj0oGsNi59cUQF4RRMS9ouBUxDDdyBVfPTypa5g==", - "dependencies": { - "@webassemblyjs/ast": "1.12.1", - "@webassemblyjs/helper-buffer": "1.12.1", - "@webassemblyjs/helper-wasm-bytecode": "1.11.6", - "@webassemblyjs/wasm-gen": "1.12.1" - } + "node_modules/@webassemblyjs/helper-wasm-section": { + "version": "1.12.1", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.12.1.tgz", + "integrity": "sha512-Jif4vfB6FJlUlSbgEMHUyk1j234GTNG9dBJ4XJdOySoj518Xj0oGsNi59cUQF4RRMS9ouBUxDDdyBVfPTypa5g==", + "dependencies": { + "@webassemblyjs/ast": "1.12.1", + "@webassemblyjs/helper-buffer": "1.12.1", + "@webassemblyjs/helper-wasm-bytecode": "1.11.6", + "@webassemblyjs/wasm-gen": "1.12.1" + } }, "node_modules/@webassemblyjs/ieee754": { 
"version": "1.11.6", @@ -6362,7 +6177,8 @@ "version": "1.1.1", "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==", - "dev": true + "dev": true, + "license": "MIT" }, "node_modules/array-includes": { "version": "3.1.9", @@ -6437,28 +6253,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/array.prototype.findlastindex": { - "version": "1.2.6", - "resolved": "https://registry.npmjs.org/array.prototype.findlastindex/-/array.prototype.findlastindex-1.2.6.tgz", - "integrity": "sha512-F/TKATkzseUExPlfvmwQKGITM3DGTK+vkAsCZoDc5daVygbJBnjEUCbgkAvVFsgfXfX4YIqZ/27G3k3tdXrTxQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "call-bind": "^1.0.8", - "call-bound": "^1.0.4", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.9", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.1.1", - "es-shim-unscopables": "^1.1.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/array.prototype.flat": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.3.3.tgz", @@ -6536,6 +6330,21 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/asn1js": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/asn1js/-/asn1js-3.0.7.tgz", + "integrity": "sha512-uLvq6KJu04qoQM6gvBfKFjlh6Gl0vOKQuR5cJMDHQkmwfMOQeN3F3SHCv9SNYSL+CRoHvOGFfllDlVz03GQjvQ==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "pvtsutils": "^1.3.6", + "pvutils": "^1.1.3", + "tslib": "^2.8.1" + }, + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/async": { "version": "3.2.6", "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz", @@ -6629,13 +6438,14 @@ } }, "node_modules/axios": { - "version": "1.7.7", - "resolved": 
"https://registry.npmjs.org/axios/-/axios-1.7.7.tgz", - "integrity": "sha512-S4kL7XrjgBmvdGut0sN3yJxqYzrDOnivkBiN0OFs6hLiUam3UPvswUo0kqGyhqUZGEOytHyumEdXsAkgCOUf3Q==", + "version": "1.13.2", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.13.2.tgz", + "integrity": "sha512-VPk9ebNqPcy5lRGuSlKx752IlDatOjT9paPlm8A7yOuW2Fbvp4X3JznJtT4f0GzGLLiWE9W8onz51SqLYwzGaA==", "dev": true, + "license": "MIT", "dependencies": { "follow-redirects": "^1.15.6", - "form-data": "^4.0.0", + "form-data": "^4.0.4", "proxy-from-env": "^1.1.0" } }, @@ -7199,43 +7009,36 @@ } }, "node_modules/body-parser": { - "version": "1.20.3", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", - "integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==", + "version": "1.20.4", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.4.tgz", + "integrity": "sha512-ZTgYYLMOXY9qKU/57FAo8F+HA2dGX7bqGc71txDRC1rS4frdFI5R7NhluHxH6M0YItAP0sHB4uqAOcYKxO6uGA==", "dev": true, + "license": "MIT", "dependencies": { - "bytes": "3.1.2", + "bytes": "~3.1.2", "content-type": "~1.0.5", "debug": "2.6.9", "depd": "2.0.0", - "destroy": "1.2.0", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "on-finished": "2.4.1", - "qs": "6.13.0", - "raw-body": "2.5.2", + "destroy": "~1.2.0", + "http-errors": "~2.0.1", + "iconv-lite": "~0.4.24", + "on-finished": "~2.4.1", + "qs": "~6.14.0", + "raw-body": "~2.5.3", "type-is": "~1.6.18", - "unpipe": "1.0.0" + "unpipe": "~1.0.0" }, "engines": { "node": ">= 0.8", "npm": "1.2.8000 || >= 1.4.16" } }, - "node_modules/body-parser/node_modules/bytes": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", - "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", - "dev": true, - "engines": { - "node": ">= 0.8" - } - }, "node_modules/body-parser/node_modules/debug": { "version": "2.6.9", 
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", "dev": true, + "license": "MIT", "dependencies": { "ms": "2.0.0" } @@ -7245,6 +7048,7 @@ "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", "dev": true, + "license": "MIT", "dependencies": { "safer-buffer": ">= 2.1.2 < 3" }, @@ -7256,7 +7060,8 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", - "dev": true + "dev": true, + "license": "MIT" }, "node_modules/bonjour-service": { "version": "1.2.1", @@ -7274,10 +7079,11 @@ "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" }, "node_modules/brace-expansion": { - "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", + "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", "dev": true, + "license": "MIT", "dependencies": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -7368,14 +7174,25 @@ } }, "node_modules/bytes": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz", - "integrity": "sha512-pMhOfFDPiv9t5jjIXkHosWmkSyQbvsgEVNkz0ERHbuLh2T/7j4Mqqpz523Fe8MVY89KC6Sh/QfS2sM+SjgFDcw==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", + "integrity": 
"sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.8" } }, + "node_modules/bytestreamjs": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/bytestreamjs/-/bytestreamjs-2.0.1.tgz", + "integrity": "sha512-U1Z/ob71V/bXfVABvNr/Kumf5VyeQRBEm6Txb0PQ6S7V5GpBM3w4Cbqz/xPDicR5tN0uvDifng8C+5qECeGwyQ==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=6.0.0" + } + }, "node_modules/call-bind": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.8.tgz", @@ -7399,7 +7216,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", - "dev": true, "license": "MIT", "dependencies": { "es-errors": "^1.3.0", @@ -7876,16 +7692,6 @@ "node": ">= 6" } }, - "node_modules/comment-parser": { - "version": "1.4.4", - "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.4.4.tgz", - "integrity": "sha512-0D6qSQ5IkeRrGJFHRClzaMOenMeT0gErz3zIw3AprKMqhRN6LNU2jQOdkPG/FZ+8bCgXE1VidrgSzlBBDZRr8A==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 12.0.0" - } - }, "node_modules/common-path-prefix": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/common-path-prefix/-/common-path-prefix-3.0.0.tgz", @@ -7903,6 +7709,7 @@ "resolved": "https://registry.npmjs.org/compressible/-/compressible-2.0.18.tgz", "integrity": "sha512-AF3r7P5dWxL8MxyITRMlORQNaOA2IkAFaTr4k7BUumjPtRpGDTZpl0Pb1XCO6JeDCBdp126Cgs9sMxqSjgYyRg==", "dev": true, + "license": "MIT", "dependencies": { "mime-db": ">= 1.43.0 < 2" }, @@ -7911,17 +7718,18 @@ } }, "node_modules/compression": { - "version": "1.7.4", - "resolved": "https://registry.npmjs.org/compression/-/compression-1.7.4.tgz", - "integrity": 
"sha512-jaSIDzP9pZVS4ZfQ+TzvtiWhdpFhE2RDHz8QJkpX9SIpLq88VueF5jJw6t+6CUQcAoA6t+x89MLrWAqpfDE8iQ==", + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/compression/-/compression-1.8.1.tgz", + "integrity": "sha512-9mAqGPHLakhCLeNyxPkK4xVo746zQ/czLH1Ky+vkitMnWfWZps8r0qXuwhwizagCRttsL4lfG4pIOvaWLpAP0w==", "dev": true, + "license": "MIT", "dependencies": { - "accepts": "~1.3.5", - "bytes": "3.0.0", - "compressible": "~2.0.16", + "bytes": "3.1.2", + "compressible": "~2.0.18", "debug": "2.6.9", - "on-headers": "~1.0.2", - "safe-buffer": "5.1.2", + "negotiator": "~0.6.4", + "on-headers": "~1.1.0", + "safe-buffer": "5.2.1", "vary": "~1.1.2" }, "engines": { @@ -7933,6 +7741,7 @@ "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", "dev": true, + "license": "MIT", "dependencies": { "ms": "2.0.0" } @@ -7941,13 +7750,18 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", - "dev": true + "dev": true, + "license": "MIT" }, - "node_modules/compression/node_modules/safe-buffer": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", - "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", - "dev": true + "node_modules/compression/node_modules/negotiator": { + "version": "0.6.4", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.4.tgz", + "integrity": "sha512-myRT3DiWPHqho5PrJaIRyaMv2kgYf0mUVgBNOYMuCH5Ki1yEiQaf/ZJuQ62nvpc44wL5WDbTX7yGJi1Neevw8w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.6" + } }, "node_modules/concat-map": { "version": "0.0.1", @@ -7969,6 +7783,7 @@ "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", 
"integrity": "sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==", "dev": true, + "license": "MIT", "dependencies": { "safe-buffer": "5.2.1" }, @@ -7981,6 +7796,7 @@ "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.6" } @@ -7992,19 +7808,21 @@ "dev": true }, "node_modules/cookie": { - "version": "0.7.1", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.1.tgz", - "integrity": "sha512-6DnInpx7SJ2AK3+CTUE/ZM0vWTUboZCegxhC2xiIydHR9jNuTAASBrfEpHhiGOZw/nX51bHt6YQl8jsGo4y/0w==", + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz", + "integrity": "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.6" } }, "node_modules/cookie-signature": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", - "integrity": "sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==", - "dev": true + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.7.tgz", + "integrity": "sha512-NXdYc3dLr47pBkpUCHtKSwIOQXLVn8dZEuywboCOJY/osA0wFSLlSawr3KN8qXJEyX66FcONTH8EIlVuK0yyFA==", + "dev": true, + "license": "MIT" }, "node_modules/copy-webpack-plugin": { "version": "11.0.0", @@ -8758,6 +8576,7 @@ "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.8" } @@ -8776,6 +8595,7 @@ "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", "integrity": 
"sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.8", "npm": "1.2.8000 || >= 1.4.16" @@ -8842,10 +8662,11 @@ "dev": true }, "node_modules/diff": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", - "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.4.tgz", + "integrity": "sha512-X07nttJQkwkfKfvTPG/KSnE2OMdcUCao6+eXF3wmnIQRn2aPAHH3VxDbDOdegkd6JbPsXqShpvEOHfAT+nCNwQ==", "dev": true, + "license": "BSD-3-Clause", "engines": { "node": ">=0.3.1" } @@ -9000,7 +8821,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", - "dev": true, "license": "MIT", "dependencies": { "call-bind-apply-helpers": "^1.0.1", @@ -9075,6 +8895,7 @@ "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.8" } @@ -9284,7 +9105,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", - "dev": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -9294,7 +9114,6 @@ "version": "1.3.0", "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", - "dev": true, "engines": { "node": ">= 0.4" } @@ -9336,7 +9155,6 @@ "version": "1.1.1", "resolved": 
"https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", - "dev": true, "license": "MIT", "dependencies": { "es-errors": "^1.3.0" @@ -9349,7 +9167,6 @@ "version": "2.1.0", "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", - "dev": true, "license": "MIT", "dependencies": { "es-errors": "^1.3.0", @@ -9484,228 +9301,29 @@ } } }, - "node_modules/eslint-config-prettier": { - "version": "10.1.5", - "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-10.1.5.tgz", - "integrity": "sha512-zc1UmCpNltmVY34vuLRV61r1K27sWuX39E+uyUnY8xS2Bex88VV9cugG+UZbRSRGtGyFboj+D8JODyme1plMpw==", - "dev": true, - "license": "MIT", - "bin": { - "eslint-config-prettier": "bin/cli.js" - }, - "funding": { - "url": "https://opencollective.com/eslint-config-prettier" - }, - "peerDependencies": { - "eslint": ">=7.0.0" - } - }, - "node_modules/eslint-import-context": { - "version": "0.1.9", - "resolved": "https://registry.npmjs.org/eslint-import-context/-/eslint-import-context-0.1.9.tgz", - "integrity": "sha512-K9Hb+yRaGAGUbwjhFNHvSmmkZs9+zbuoe3kFQ4V1wYjrepUFYM2dZAfNtjbbj3qsPfUfsA68Bx/ICWQMi+C8Eg==", - "dev": true, - "license": "MIT", - "dependencies": { - "get-tsconfig": "^4.10.1", - "stable-hash-x": "^0.2.0" - }, - "engines": { - "node": "^12.20.0 || ^14.18.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint-import-context" - }, - "peerDependencies": { - "unrs-resolver": "^1.0.0" - }, - "peerDependenciesMeta": { - "unrs-resolver": { - "optional": true - } - } - }, - "node_modules/eslint-import-resolver-node": { - "version": "0.3.9", - "resolved": "https://registry.npmjs.org/eslint-import-resolver-node/-/eslint-import-resolver-node-0.3.9.tgz", - "integrity": 
"sha512-WFj2isz22JahUv+B788TlO3N6zL3nNJGU8CcZbPZvVEkBPaJdCV4vy5wyghty5ROFbCRnm132v8BScu5/1BQ8g==", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": "^3.2.7", - "is-core-module": "^2.13.0", - "resolve": "^1.22.4" - } - }, - "node_modules/eslint-import-resolver-node/node_modules/debug": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", - "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "^2.1.1" - } - }, - "node_modules/eslint-module-utils": { - "version": "2.12.1", - "resolved": "https://registry.npmjs.org/eslint-module-utils/-/eslint-module-utils-2.12.1.tgz", - "integrity": "sha512-L8jSWTze7K2mTg0vos/RuLRS5soomksDPoJLXIslC7c8Wmut3bx7CPpJijDcBZtxQ5lrbUdM+s0OlNbz0DCDNw==", - "dev": true, - "license": "MIT", - "dependencies": { - "debug": "^3.2.7" - }, - "engines": { - "node": ">=4" - }, - "peerDependenciesMeta": { - "eslint": { - "optional": true - } - } - }, - "node_modules/eslint-module-utils/node_modules/debug": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", - "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "^2.1.1" - } - }, - "node_modules/eslint-plugin-i18n": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/eslint-plugin-i18n/-/eslint-plugin-i18n-2.4.0.tgz", - "integrity": "sha512-6RpPoj+lr0xk6SNljziOjGfDtuQSN6cw/gdds248N5MvCQUrPxo5+0s7b7TQsEl1qLr5OVnCMxsaRBy/4T62cg==", - "dev": true, - "engines": { - "node": ">=12.0.0" - } - }, - "node_modules/eslint-plugin-import": { - "version": "2.32.0", - "resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.32.0.tgz", - "integrity": 
"sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@rtsao/scc": "^1.1.0", - "array-includes": "^3.1.9", - "array.prototype.findlastindex": "^1.2.6", - "array.prototype.flat": "^1.3.3", - "array.prototype.flatmap": "^1.3.3", - "debug": "^3.2.7", - "doctrine": "^2.1.0", - "eslint-import-resolver-node": "^0.3.9", - "eslint-module-utils": "^2.12.1", - "hasown": "^2.0.2", - "is-core-module": "^2.16.1", - "is-glob": "^4.0.3", - "minimatch": "^3.1.2", - "object.fromentries": "^2.0.8", - "object.groupby": "^1.0.3", - "object.values": "^1.2.1", - "semver": "^6.3.1", - "string.prototype.trimend": "^1.0.9", - "tsconfig-paths": "^3.15.0" - }, - "engines": { - "node": ">=4" - }, - "peerDependencies": { - "eslint": "^2 || ^3 || ^4 || ^5 || ^6 || ^7.2.0 || ^8 || ^9" - } - }, - "node_modules/eslint-plugin-import-x": { - "version": "4.16.1", - "resolved": "https://registry.npmjs.org/eslint-plugin-import-x/-/eslint-plugin-import-x-4.16.1.tgz", - "integrity": "sha512-vPZZsiOKaBAIATpFE2uMI4w5IRwdv/FpQ+qZZMR4E+PeOcM4OeoEbqxRMnywdxP19TyB/3h6QBB0EWon7letSQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/types": "^8.35.0", - "comment-parser": "^1.4.1", - "debug": "^4.4.1", - "eslint-import-context": "^0.1.9", - "is-glob": "^4.0.3", - "minimatch": "^9.0.3 || ^10.0.1", - "semver": "^7.7.2", - "stable-hash-x": "^0.2.0", - "unrs-resolver": "^1.9.2" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://opencollective.com/eslint-plugin-import-x" - }, - "peerDependencies": { - "@typescript-eslint/utils": "^8.0.0", - "eslint": "^8.57.0 || ^9.0.0", - "eslint-import-resolver-node": "*" - }, - "peerDependenciesMeta": { - "@typescript-eslint/utils": { - "optional": true - }, - "eslint-import-resolver-node": { - "optional": true - } - } - }, - 
"node_modules/eslint-plugin-import-x/node_modules/@typescript-eslint/types": { - "version": "8.53.1", - "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.53.1.tgz", - "integrity": "sha512-jr/swrr2aRmUAUjW5/zQHbMaui//vQlsZcJKijZf3M26bnmLj8LyZUpj8/Rd6uzaek06OWsqdofN/Thenm5O8A==", + "node_modules/eslint-config-prettier": { + "version": "10.1.5", + "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-10.1.5.tgz", + "integrity": "sha512-zc1UmCpNltmVY34vuLRV61r1K27sWuX39E+uyUnY8xS2Bex88VV9cugG+UZbRSRGtGyFboj+D8JODyme1plMpw==", "dev": true, "license": "MIT", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "bin": { + "eslint-config-prettier": "bin/cli.js" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - } - }, - "node_modules/eslint-plugin-import-x/node_modules/minimatch": { - "version": "10.1.1", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.1.1.tgz", - "integrity": "sha512-enIvLvRAFZYXJzkCYG5RKmPfrFArdLv+R+lbQ53BmIMLIry74bjKzX6iHAm8WYamJkhSSEabrWN5D97XnKObjQ==", - "dev": true, - "license": "BlueOak-1.0.0", - "dependencies": { - "@isaacs/brace-expansion": "^5.0.0" - }, - "engines": { - "node": "20 || >=22" + "url": "https://opencollective.com/eslint-config-prettier" }, - "funding": { - "url": "https://github.com/sponsors/isaacs" + "peerDependencies": { + "eslint": ">=7.0.0" } }, - "node_modules/eslint-plugin-import-x/node_modules/semver": { - "version": "7.7.3", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", - "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", + "node_modules/eslint-plugin-i18n": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-i18n/-/eslint-plugin-i18n-2.4.0.tgz", + "integrity": "sha512-6RpPoj+lr0xk6SNljziOjGfDtuQSN6cw/gdds248N5MvCQUrPxo5+0s7b7TQsEl1qLr5OVnCMxsaRBy/4T62cg==", "dev": true, - 
"license": "ISC", - "bin": { - "semver": "bin/semver.js" - }, "engines": { - "node": ">=10" - } - }, - "node_modules/eslint-plugin-import/node_modules/debug": { - "version": "3.2.7", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.7.tgz", - "integrity": "sha512-CFjzYYAi4ThfiQvizrFQevTTXHtnCqWfe7x1AhgEscTz6ZbLbfoLRLPugTQyBth6f8ZERVUSyWHFD/7Wu4t1XQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "ms": "^2.1.1" + "node": ">=12.0.0" } }, "node_modules/eslint-plugin-prettier": { @@ -10015,6 +9633,7 @@ "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", "integrity": "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.6" } @@ -10090,45 +9709,50 @@ } }, "node_modules/express": { - "version": "4.21.1", - "resolved": "https://registry.npmjs.org/express/-/express-4.21.1.tgz", - "integrity": "sha512-YSFlK1Ee0/GC8QaO91tHcDxJiE/X4FbpAyQWkxAvG6AXCuR65YzK8ua6D9hvi/TzUfZMpc+BwuM1IPw8fmQBiQ==", + "version": "4.22.1", + "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz", + "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==", "dev": true, + "license": "MIT", "dependencies": { "accepts": "~1.3.8", "array-flatten": "1.1.1", - "body-parser": "1.20.3", - "content-disposition": "0.5.4", + "body-parser": "~1.20.3", + "content-disposition": "~0.5.4", "content-type": "~1.0.4", - "cookie": "0.7.1", - "cookie-signature": "1.0.6", + "cookie": "~0.7.1", + "cookie-signature": "~1.0.6", "debug": "2.6.9", "depd": "2.0.0", "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "etag": "~1.8.1", - "finalhandler": "1.3.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", + "finalhandler": "~1.3.1", + "fresh": "~0.5.2", + "http-errors": "~2.0.0", "merge-descriptors": "1.0.3", "methods": "~1.1.2", - "on-finished": "2.4.1", + "on-finished": "~2.4.1", "parseurl": "~1.3.3", - "path-to-regexp": 
"0.1.10", + "path-to-regexp": "~0.1.12", "proxy-addr": "~2.0.7", - "qs": "6.13.0", + "qs": "~6.14.0", "range-parser": "~1.2.1", "safe-buffer": "5.2.1", - "send": "0.19.0", - "serve-static": "1.16.2", + "send": "~0.19.0", + "serve-static": "~1.16.2", "setprototypeof": "1.2.0", - "statuses": "2.0.1", + "statuses": "~2.0.1", "type-is": "~1.6.18", "utils-merge": "1.0.1", "vary": "~1.1.2" }, "engines": { "node": ">= 0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/express/node_modules/debug": { @@ -10136,6 +9760,7 @@ "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", "dev": true, + "license": "MIT", "dependencies": { "ms": "2.0.0" } @@ -10144,7 +9769,8 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", - "dev": true + "dev": true, + "license": "MIT" }, "node_modules/fast-deep-equal": { "version": "3.1.3", @@ -10380,10 +10006,11 @@ } }, "node_modules/filelist/node_modules/brace-expansion": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", - "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", + "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==", "dev": true, + "license": "MIT", "dependencies": { "balanced-match": "^1.0.0" } @@ -10422,17 +10049,18 @@ } }, "node_modules/finalhandler": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz", - "integrity": 
"sha512-6BN9trH7bp3qvnrRyzsBz+g3lZxTNZTbVO2EV1CS0WIcDbawYVdYvGflME/9QP0h0pYlCDBCTjYa9nZzMDpyxQ==", + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.2.tgz", + "integrity": "sha512-aA4RyPcd3badbdABGDuTXCMTtOneUCAYH/gxoYRTZlIJdF0YPWuGqiAsIrhNnnqdXGswYk6dGujem4w80UJFhg==", "dev": true, + "license": "MIT", "dependencies": { "debug": "2.6.9", "encodeurl": "~2.0.0", "escape-html": "~1.0.3", - "on-finished": "2.4.1", + "on-finished": "~2.4.1", "parseurl": "~1.3.3", - "statuses": "2.0.1", + "statuses": "~2.0.2", "unpipe": "~1.0.0" }, "engines": { @@ -10444,6 +10072,7 @@ "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", "dev": true, + "license": "MIT", "dependencies": { "ms": "2.0.0" } @@ -10452,7 +10081,8 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", - "dev": true + "dev": true, + "license": "MIT" }, "node_modules/find-cache-dir": { "version": "2.1.0", @@ -10682,12 +10312,15 @@ } }, "node_modules/form-data": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.1.tgz", - "integrity": "sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw==", + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", + "integrity": "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", + "license": "MIT", "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", "mime-types": "^2.1.12" }, "engines": { @@ -10716,6 +10349,7 @@ "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", "integrity": 
"sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.6" } @@ -10738,6 +10372,7 @@ "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", "integrity": "sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.6" } @@ -10793,7 +10428,6 @@ "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", - "dev": true, "funding": { "url": "https://github.com/sponsors/ljharb" } @@ -10873,7 +10507,6 @@ "version": "1.3.0", "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", - "dev": true, "license": "MIT", "dependencies": { "call-bind-apply-helpers": "^1.0.2", @@ -10907,7 +10540,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", - "dev": true, "license": "MIT", "dependencies": { "dunder-proto": "^1.0.1", @@ -10947,19 +10579,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/get-tsconfig": { - "version": "4.13.0", - "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.13.0.tgz", - "integrity": "sha512-1VKTZJCwBrvbd+Wn3AOgQP/2Av+TfTCOlE4AcRJE72W1ksZXbAx8PPBR9RzgTeSPzlPMHrbANMH3LbltH73wxQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "resolve-pkg-maps": "^1.0.0" - }, - "funding": { - "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" - } - }, "node_modules/glob": { "version": "7.2.3", "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", @@ 
-11094,7 +10713,6 @@ "version": "1.2.0", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", - "dev": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -11203,7 +10821,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", - "dev": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -11216,7 +10833,6 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", - "dev": true, "dependencies": { "has-symbols": "^1.0.3" }, @@ -11231,7 +10847,6 @@ "version": "2.0.2", "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", - "dev": true, "dependencies": { "function-bind": "^1.1.2" }, @@ -11440,19 +11055,24 @@ "dev": true }, "node_modules/http-errors": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", - "integrity": "sha512-FtwrG/euBzaEjYeRqOgly7G0qviiXoJWnvEH2Z1plBdXgbyjv34pHTSb9zoeHMyDy33+DWy5Wt9Wo+TURtOYSQ==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", + "integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==", "dev": true, + "license": "MIT", "dependencies": { - "depd": "2.0.0", - "inherits": "2.0.4", - "setprototypeof": "1.2.0", - "statuses": "2.0.1", - "toidentifier": "1.0.1" + "depd": "~2.0.0", + "inherits": "~2.0.4", + "setprototypeof": "~1.2.0", + "statuses": "~2.0.2", + "toidentifier": "~1.0.1" }, "engines": { "node": ">= 0.8" + 
}, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/http-parser-js": { @@ -11476,10 +11096,11 @@ } }, "node_modules/http-proxy-middleware": { - "version": "2.0.7", - "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz", - "integrity": "sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==", + "version": "2.0.9", + "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz", + "integrity": "sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==", "dev": true, + "license": "MIT", "dependencies": { "@types/http-proxy": "^1.17.8", "http-proxy": "^1.18.1", @@ -13413,9 +13034,9 @@ "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==" }, "node_modules/js-yaml": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", - "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz", + "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==", "license": "MIT", "dependencies": { "argparse": "^2.0.1" @@ -13793,14 +13414,16 @@ } }, "node_modules/lodash": { - "version": "4.17.21", - "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", - "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" + "version": "4.17.23", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.23.tgz", + "integrity": "sha512-LgVTMpQtIopCi79SJeDiP0TfWi5CNEc/L/aRdTh3yIvmZXTnheWpKjSZhnvMl8iXbC1tFg9gdHHDMLoV7CnG+w==", + "license": "MIT" }, "node_modules/lodash-es": { - "version": "4.17.21", - "resolved": 
"https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.21.tgz", - "integrity": "sha512-mKnC+QJ9pWVzv+C4/U3rRsHapFfHvQFoFB92e52xeyGMcX6/OlIl78je1u8vePzYZSkkogMPJ2yjxxsb89cxyw==" + "version": "4.17.23", + "resolved": "https://registry.npmjs.org/lodash-es/-/lodash-es-4.17.23.tgz", + "integrity": "sha512-kVI48u3PZr38HdYz98UmfPnXl2DXrpdctLrFLCd3kOx1xUkOmpFPx7gCWWM5MPkL/fD8zb+Ph0QzjGFs4+hHWg==", + "license": "MIT" }, "node_modules/lodash.camelcase": { "version": "4.3.0", @@ -14082,7 +13705,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", - "dev": true, "license": "MIT", "engines": { "node": ">= 0.4" @@ -14108,6 +13730,7 @@ "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", "integrity": "sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.6" } @@ -14129,6 +13752,7 @@ "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==", "dev": true, + "license": "MIT", "funding": { "url": "https://github.com/sponsors/sindresorhus" } @@ -14152,6 +13776,7 @@ "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", "integrity": "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.6" } @@ -14174,6 +13799,7 @@ "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", "dev": true, + "license": "MIT", "bin": { "mime": "cli.js" }, @@ -14332,15 +13958,16 @@ "integrity": 
"sha512-wynEP02LmIbLpcYw8uBKpcfF6dmg2vcpKqxeH5UcoKEYdExslsdUA4ugFauuaeYdTB76ez6gJW8XAZ6CgkXYxA==" }, "node_modules/nanoid": { - "version": "3.3.7", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.7.tgz", - "integrity": "sha512-eSRppjcPIatRIMC1U6UngP8XFcz8MQWGQdt1MTBQ7NaAmvXDfvNxbvWV3x2y6CdEUciCSsDHDQZbhYaB8QEo2g==", + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", "funding": [ { "type": "github", "url": "https://github.com/sponsors/ai" } ], + "license": "MIT", "bin": { "nanoid": "bin/nanoid.cjs" }, @@ -14348,22 +13975,6 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, - "node_modules/napi-postinstall": { - "version": "0.3.4", - "resolved": "https://registry.npmjs.org/napi-postinstall/-/napi-postinstall-0.3.4.tgz", - "integrity": "sha512-PHI5f1O0EP5xJ9gQmFGMS6IZcrVvTjpXjz7Na41gTE7eE2hK11lg04CECCYEEjdc17EV4DO+fkGEtt7TpTaTiQ==", - "dev": true, - "license": "MIT", - "bin": { - "napi-postinstall": "lib/cli.js" - }, - "engines": { - "node": "^12.20.0 || ^14.18.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/napi-postinstall" - } - }, "node_modules/natural-compare": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", @@ -14478,15 +14089,6 @@ "node": "4.x || >=6.0.0" } }, - "node_modules/node-forge": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-1.3.1.tgz", - "integrity": "sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA==", - "dev": true, - "engines": { - "node": ">= 6.13.0" - } - }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -19793,21 +19395,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/object.groupby": { - "version": 
"1.0.3", - "resolved": "https://registry.npmjs.org/object.groupby/-/object.groupby-1.0.3.tgz", - "integrity": "sha512-+Lhy3TQTuzXI5hevh8sBGqbmurHbbIjAi0Z4S63nthVLmLxfbj4T54a4CfZrXIrt9iP4mVAPYMo/v99taj3wjQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "call-bind": "^1.0.7", - "define-properties": "^1.2.1", - "es-abstract": "^1.23.2" - }, - "engines": { - "node": ">= 0.4" - } - }, "node_modules/object.values": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/object.values/-/object.values-1.2.1.tgz", @@ -19846,10 +19433,11 @@ } }, "node_modules/on-headers": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.0.2.tgz", - "integrity": "sha512-pZAE+FJLoyITytdqK0U5s+FIpjN0JP3OzFi/u8Rx+EV5/W+JTWGXG8xFzevE7AjBfDqHv/8vL8qQsIhHnqRkrA==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/on-headers/-/on-headers-1.1.0.tgz", + "integrity": "sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.8" } @@ -20173,10 +19761,11 @@ "dev": true }, "node_modules/path-to-regexp": { - "version": "0.1.10", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.10.tgz", - "integrity": "sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w==", - "dev": true + "version": "0.1.12", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz", + "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==", + "dev": true, + "license": "MIT" }, "node_modules/path-type": { "version": "4.0.0", @@ -20386,6 +19975,24 @@ "node": ">=4" } }, + "node_modules/pkijs": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/pkijs/-/pkijs-3.3.3.tgz", + "integrity": "sha512-+KD8hJtqQMYoTuL1bbGOqxb4z+nZkTAwVdNtWwe8Tc2xNbEmdJYIYoc6Qt0uF55e6YW6KuTHw1DjQ18gMhzepw==", + "dev": true, + 
"license": "BSD-3-Clause", + "dependencies": { + "@noble/hashes": "1.4.0", + "asn1js": "^3.0.6", + "bytestreamjs": "^2.0.1", + "pvtsutils": "^1.3.6", + "pvutils": "^1.1.3", + "tslib": "^2.8.1" + }, + "engines": { + "node": ">=16.0.0" + } + }, "node_modules/possible-typed-array-names": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.1.0.tgz", @@ -21604,6 +21211,7 @@ "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", "integrity": "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==", "dev": true, + "license": "MIT", "dependencies": { "forwarded": "0.2.0", "ipaddr.js": "1.9.1" @@ -21617,6 +21225,7 @@ "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.10" } @@ -21651,13 +21260,34 @@ } ] }, + "node_modules/pvtsutils": { + "version": "1.3.6", + "resolved": "https://registry.npmjs.org/pvtsutils/-/pvtsutils-1.3.6.tgz", + "integrity": "sha512-PLgQXQ6H2FWCaeRak8vvk1GW462lMxB5s3Jm673N82zI4vqtVUPuZdffdZbPDFRoU8kAhItWFtPCWiPpp4/EDg==", + "dev": true, + "license": "MIT", + "dependencies": { + "tslib": "^2.8.1" + } + }, + "node_modules/pvutils": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/pvutils/-/pvutils-1.1.5.tgz", + "integrity": "sha512-KTqnxsgGiQ6ZAzZCVlJH5eOjSnvlyEgx1m8bkRJfOhmGRqfo5KLvmAlACQkrjEtOQ4B7wF9TdSLIs9O90MX9xA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=16.0.0" + } + }, "node_modules/qs": { - "version": "6.13.0", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", - "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", + "version": "6.14.1", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.14.1.tgz", + "integrity": 
"sha512-4EK3+xJl8Ts67nLYNwqw/dsFVnCf+qR7RgXSK9jEEm9unao3njwMDdmsdvoKBKHzxd7tCYz5e5M+SnMjdtXGQQ==", "dev": true, + "license": "BSD-3-Clause", "dependencies": { - "side-channel": "^1.0.6" + "side-channel": "^1.1.0" }, "engines": { "node": ">=0.6" @@ -21732,34 +21362,27 @@ } }, "node_modules/raw-body": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz", - "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==", + "version": "2.5.3", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.3.tgz", + "integrity": "sha512-s4VSOf6yN0rvbRZGxs8Om5CWj6seneMwK3oDb4lWDH0UPhWcxwOWw5+qk24bxq87szX1ydrwylIOp2uG1ojUpA==", "dev": true, + "license": "MIT", "dependencies": { - "bytes": "3.1.2", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "unpipe": "1.0.0" + "bytes": "~3.1.2", + "http-errors": "~2.0.1", + "iconv-lite": "~0.4.24", + "unpipe": "~1.0.0" }, "engines": { "node": ">= 0.8" } }, - "node_modules/raw-body/node_modules/bytes": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", - "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", - "dev": true, - "engines": { - "node": ">= 0.8" - } - }, "node_modules/raw-body/node_modules/iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", "dev": true, + "license": "MIT", "dependencies": { "safer-buffer": ">= 2.1.2 < 3" }, @@ -22042,11 +21665,12 @@ } }, "node_modules/react-router": { - "version": "6.27.0", - "resolved": "https://registry.npmjs.org/react-router/-/react-router-6.27.0.tgz", - "integrity": "sha512-YA+HGZXz4jaAkVoYBE98VQl+nVzI+cVI2Oj/06F5ZM+0u3TgedN9Y9kmMRo2mnkSK2nCpNQn0DVob4HCsY/WLw==", + "version": "6.30.3", + "resolved": 
"https://registry.npmjs.org/react-router/-/react-router-6.30.3.tgz", + "integrity": "sha512-XRnlbKMTmktBkjCLE8/XcZFlnHvr2Ltdr1eJX4idL55/9BbORzyZEaIkBFDhFGCEWBBItsVrDxwx3gnisMitdw==", + "license": "MIT", "dependencies": { - "@remix-run/router": "1.20.0" + "@remix-run/router": "1.23.2" }, "engines": { "node": ">=14.0.0" @@ -22056,12 +21680,13 @@ } }, "node_modules/react-router-dom": { - "version": "6.27.0", - "resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-6.27.0.tgz", - "integrity": "sha512-+bvtFWMC0DgAFrfKXKG9Fc+BcXWRUO1aJIihbB79xaeq0v5UzfvnM5houGUm1Y461WVRcgAQ+Clh5rdb1eCx4g==", + "version": "6.30.3", + "resolved": "https://registry.npmjs.org/react-router-dom/-/react-router-dom-6.30.3.tgz", + "integrity": "sha512-pxPcv1AczD4vso7G4Z3TKcvlxK7g7TNt3/FNGMhfqyntocvYKj+GCatfigGDjbLozC4baguJ0ReCigoDJXb0ag==", + "license": "MIT", "dependencies": { - "@remix-run/router": "1.20.0", - "react-router": "6.27.0" + "@remix-run/router": "1.23.2", + "react-router": "6.30.3" }, "engines": { "node": ">=14.0.0" @@ -22197,6 +21822,13 @@ "resolved": "https://registry.npmjs.org/redux/-/redux-5.0.1.tgz", "integrity": "sha512-M9/ELqF6fy8FwmkpnF0S3YKOqMyoWJ4+CS5Efg2ct3oY9daQvd/Pc71FpGZsVsbl3Cpb+IIcjBDUnnyBdQbq4w==" }, + "node_modules/reflect-metadata": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/reflect-metadata/-/reflect-metadata-0.2.2.tgz", + "integrity": "sha512-urBwgfrvVP/eAyXx4hluJivBKzuEbSQs9rKWCrCkbSxNv8mxPcUZKeuoF3Uy4mJl3Lwprp6yy5/39VWigZ4K6Q==", + "dev": true, + "license": "Apache-2.0" + }, "node_modules/reflect.getprototypeof": { "version": "1.0.10", "resolved": "https://registry.npmjs.org/reflect.getprototypeof/-/reflect.getprototypeof-1.0.10.tgz", @@ -22247,11 +21879,6 @@ "node": ">=4" } }, - "node_modules/regenerator-runtime": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz", - "integrity": 
"sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==" - }, "node_modules/regenerator-transform": { "version": "0.15.2", "resolved": "https://registry.npmjs.org/regenerator-transform/-/regenerator-transform-0.15.2.tgz", @@ -22512,16 +22139,6 @@ "node": ">=4" } }, - "node_modules/resolve-pkg-maps": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", - "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", - "dev": true, - "license": "MIT", - "funding": { - "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" - } - }, "node_modules/resolve-url-loader": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/resolve-url-loader/-/resolve-url-loader-5.0.0.tgz", @@ -22909,16 +22526,17 @@ "dev": true }, "node_modules/selfsigned": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-2.4.1.tgz", - "integrity": "sha512-th5B4L2U+eGLq1TVh7zNRGBapioSORUeymIydxgFpwww9d2qyKvtuPU2jJuHvYAwwqi2Y596QBL3eEqcPEYL8Q==", + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/selfsigned/-/selfsigned-5.5.0.tgz", + "integrity": "sha512-ftnu3TW4+3eBfLRFnDEkzGxSF/10BJBkaLJuBHZX0kiPS7bRdlpZGu6YGt4KngMkdTwJE6MbjavFpqHvqVt+Ew==", "dev": true, + "license": "MIT", "dependencies": { - "@types/node-forge": "^1.3.0", - "node-forge": "^1" + "@peculiar/x509": "^1.14.2", + "pkijs": "^3.3.3" }, "engines": { - "node": ">=10" + "node": ">=18" } }, "node_modules/semver": { @@ -22931,24 +22549,25 @@ } }, "node_modules/send": { - "version": "0.19.0", - "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", - "integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==", + "version": "0.19.2", + "resolved": "https://registry.npmjs.org/send/-/send-0.19.2.tgz", + "integrity": 
"sha512-VMbMxbDeehAxpOtWJXlcUS5E8iXh6QmN+BkRX1GARS3wRaXEEgzCcB10gTQazO42tpNIya8xIyNx8fll1OFPrg==", "dev": true, + "license": "MIT", "dependencies": { "debug": "2.6.9", "depd": "2.0.0", "destroy": "1.2.0", - "encodeurl": "~1.0.2", + "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "etag": "~1.8.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", + "fresh": "~0.5.2", + "http-errors": "~2.0.1", "mime": "1.6.0", "ms": "2.1.3", - "on-finished": "2.4.1", + "on-finished": "~2.4.1", "range-parser": "~1.2.1", - "statuses": "2.0.1" + "statuses": "~2.0.2" }, "engines": { "node": ">= 0.8.0" @@ -22959,6 +22578,7 @@ "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", "dev": true, + "license": "MIT", "dependencies": { "ms": "2.0.0" } @@ -22967,16 +22587,8 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", - "dev": true - }, - "node_modules/send/node_modules/encodeurl": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", - "integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==", "dev": true, - "engines": { - "node": ">= 0.8" - } + "license": "MIT" }, "node_modules/serialize-javascript": { "version": "6.0.2", @@ -23065,15 +22677,16 @@ } }, "node_modules/serve-static": { - "version": "1.16.2", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.2.tgz", - "integrity": "sha512-VqpjJZKadQB/PEbEwvFdO43Ax5dFBZ2UECszz8bQ7pi7wt//PWe1P6MN7eCnjsatYtBT6EuiClbjSWP2WrIoTw==", + "version": "1.16.3", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.3.tgz", + "integrity": "sha512-x0RTqQel6g5SY7Lg6ZreMmsOzncHFU7nhnRWkKgWuMTu5NN0DR5oruckMqRvacAN9d5w6ARnRBXl9xhDCgfMeA==", "dev": true, + 
"license": "MIT", "dependencies": { "encodeurl": "~2.0.0", "escape-html": "~1.0.3", "parseurl": "~1.3.3", - "send": "0.19.0" + "send": "~0.19.1" }, "engines": { "node": ">= 0.8.0" @@ -23131,7 +22744,8 @@ "version": "1.2.0", "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==", - "dev": true + "dev": true, + "license": "ISC" }, "node_modules/shallow-clone": { "version": "3.0.1", @@ -23539,16 +23153,6 @@ "integrity": "sha512-ji9qxRnOVfcuLDySj9qzhGSEFVobyt1kIOSkj1qZzYLzq7Tos/oUUWvotUPQLlrsidqsK6tBH89Bc9kL5zHA6w==", "deprecated": "Modern JS already guarantees Array#sort() is a stable sort, so this library is deprecated. See the compatibility table on MDN: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/sort#browser_compatibility" }, - "node_modules/stable-hash-x": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/stable-hash-x/-/stable-hash-x-0.2.0.tgz", - "integrity": "sha512-o3yWv49B/o4QZk5ZcsALc6t0+eCelPc44zZsLtCQnZPDwFpDYSWcDnrv2TtMmMbQ7uKo3J0HTURCqckw23czNQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12.0.0" - } - }, "node_modules/stack-utils": { "version": "2.0.6", "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-2.0.6.tgz", @@ -23577,10 +23181,11 @@ "dev": true }, "node_modules/statuses": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz", - "integrity": "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz", + "integrity": "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.8" } @@ -24258,6 +23863,7 @@ "resolved": 
"https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", "integrity": "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==", "dev": true, + "license": "MIT", "engines": { "node": ">=0.6" } @@ -24404,46 +24010,31 @@ } } }, - "node_modules/tsconfig-paths": { - "version": "3.15.0", - "resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.15.0.tgz", - "integrity": "sha512-2Ac2RgzDe/cn48GvOe3M+o82pEFewD3UPbyoUHHdKasHwJKjds4fLXWf/Ux5kATBKN20oaFGu+jbElp1pos0mg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/json5": "^0.0.29", - "json5": "^1.0.2", - "minimist": "^1.2.6", - "strip-bom": "^3.0.0" - } + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "license": "0BSD" }, - "node_modules/tsconfig-paths/node_modules/json5": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/json5/-/json5-1.0.2.tgz", - "integrity": "sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==", + "node_modules/tsyringe": { + "version": "4.10.0", + "resolved": "https://registry.npmjs.org/tsyringe/-/tsyringe-4.10.0.tgz", + "integrity": "sha512-axr3IdNuVIxnaK5XGEUFTu3YmAQ6lllgrvqfEoR16g/HGnYY/6We4oWENtAnzK6/LpJ2ur9PAb80RBt7/U4ugw==", "dev": true, "license": "MIT", "dependencies": { - "minimist": "^1.2.0" + "tslib": "^1.9.3" }, - "bin": { - "json5": "lib/cli.js" - } - }, - "node_modules/tsconfig-paths/node_modules/strip-bom": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", - "integrity": "sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==", - "dev": true, - "license": "MIT", "engines": { - "node": ">=4" + "node": ">= 6.0.0" } }, - "node_modules/tslib": { - "version": "2.8.0", - 
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.0.tgz", - "integrity": "sha512-jWVzBLplnCmoaTr13V9dYbiQ99wvZRd0vNWaDRg+aVYRcjDF3nDksxFDE/+fkXnKhpnUUkmx5pK/v8mCtLVqZA==" + "node_modules/tsyringe/node_modules/tslib": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", + "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==", + "dev": true, + "license": "0BSD" }, "node_modules/type-check": { "version": "0.4.0", @@ -24483,6 +24074,7 @@ "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", "dev": true, + "license": "MIT", "dependencies": { "media-typer": "0.3.0", "mime-types": "~2.1.24" @@ -24602,10 +24194,11 @@ } }, "node_modules/undici": { - "version": "6.20.1", - "resolved": "https://registry.npmjs.org/undici/-/undici-6.20.1.tgz", - "integrity": "sha512-AjQF1QsmqfJys+LXfGTNum+qw4S88CojRInG/6t31W/1fk6G59s92bnAvGz5Cmur+kQv2SURXEvvudLmbrE8QA==", + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.23.0.tgz", + "integrity": "sha512-VfQPToRA5FZs/qJxLIinmU59u0r7LXqoJkCzinq3ckNJp3vKEh7jTWN589YQ5+aoAC/TGRLyJLCPKcLQbM8r9g==", "dev": true, + "license": "MIT", "engines": { "node": ">=18.17" } @@ -24669,45 +24262,11 @@ "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.8" } }, - "node_modules/unrs-resolver": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/unrs-resolver/-/unrs-resolver-1.11.1.tgz", - "integrity": "sha512-bSjt9pjaEBnNiGgc9rUiHGKv5l4/TGzDmYw3RhnkJGtLhbnnA/5qJj7x3dNDCRx/PJxu774LlH8lCOlB4hEfKg==", - "dev": true, - "hasInstallScript": true, - "license": "MIT", - "dependencies": { - "napi-postinstall": 
"^0.3.0" - }, - "funding": { - "url": "https://opencollective.com/unrs-resolver" - }, - "optionalDependencies": { - "@unrs/resolver-binding-android-arm-eabi": "1.11.1", - "@unrs/resolver-binding-android-arm64": "1.11.1", - "@unrs/resolver-binding-darwin-arm64": "1.11.1", - "@unrs/resolver-binding-darwin-x64": "1.11.1", - "@unrs/resolver-binding-freebsd-x64": "1.11.1", - "@unrs/resolver-binding-linux-arm-gnueabihf": "1.11.1", - "@unrs/resolver-binding-linux-arm-musleabihf": "1.11.1", - "@unrs/resolver-binding-linux-arm64-gnu": "1.11.1", - "@unrs/resolver-binding-linux-arm64-musl": "1.11.1", - "@unrs/resolver-binding-linux-ppc64-gnu": "1.11.1", - "@unrs/resolver-binding-linux-riscv64-gnu": "1.11.1", - "@unrs/resolver-binding-linux-riscv64-musl": "1.11.1", - "@unrs/resolver-binding-linux-s390x-gnu": "1.11.1", - "@unrs/resolver-binding-linux-x64-gnu": "1.11.1", - "@unrs/resolver-binding-linux-x64-musl": "1.11.1", - "@unrs/resolver-binding-wasm32-wasi": "1.11.1", - "@unrs/resolver-binding-win32-arm64-msvc": "1.11.1", - "@unrs/resolver-binding-win32-ia32-msvc": "1.11.1", - "@unrs/resolver-binding-win32-x64-msvc": "1.11.1" - } - }, "node_modules/update-browserslist-db": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.1.1.tgz", @@ -24769,6 +24328,7 @@ "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", "integrity": "sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.4.0" } @@ -24807,6 +24367,7 @@ "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", "integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==", "dev": true, + "license": "MIT", "engines": { "node": ">= 0.8" } @@ -25024,14 +24585,16 @@ } }, "node_modules/webpack-dev-server": { - "version": "5.1.0", - "resolved": 
"https://registry.npmjs.org/webpack-dev-server/-/webpack-dev-server-5.1.0.tgz", - "integrity": "sha512-aQpaN81X6tXie1FoOB7xlMfCsN19pSvRAeYUHOdFWOlhpQ/LlbfTqYwwmEDFV0h8GGuqmCmKmT+pxcUV/Nt2gQ==", + "version": "5.2.3", + "resolved": "https://registry.npmjs.org/webpack-dev-server/-/webpack-dev-server-5.2.3.tgz", + "integrity": "sha512-9Gyu2F7+bg4Vv+pjbovuYDhHX+mqdqITykfzdM9UyKqKHlsE5aAjRhR+oOEfXW5vBeu8tarzlJFIZva4ZjAdrQ==", "dev": true, + "license": "MIT", "dependencies": { "@types/bonjour": "^3.5.13", "@types/connect-history-api-fallback": "^1.5.4", - "@types/express": "^4.17.21", + "@types/express": "^4.17.25", + "@types/express-serve-static-core": "^4.17.21", "@types/serve-index": "^1.9.4", "@types/serve-static": "^1.15.5", "@types/sockjs": "^0.3.36", @@ -25040,18 +24603,17 @@ "bonjour-service": "^1.2.1", "chokidar": "^3.6.0", "colorette": "^2.0.10", - "compression": "^1.7.4", + "compression": "^1.8.1", "connect-history-api-fallback": "^2.0.0", - "express": "^4.19.2", + "express": "^4.22.1", "graceful-fs": "^4.2.6", - "html-entities": "^2.4.0", - "http-proxy-middleware": "^2.0.3", + "http-proxy-middleware": "^2.0.9", "ipaddr.js": "^2.1.0", "launch-editor": "^2.6.1", "open": "^10.0.3", "p-retry": "^6.2.0", "schema-utils": "^4.2.0", - "selfsigned": "^2.4.1", + "selfsigned": "^5.5.0", "serve-index": "^1.9.1", "sockjs": "^0.3.24", "spdy": "^4.0.2", @@ -25080,6 +24642,19 @@ } } }, + "node_modules/webpack-dev-server/node_modules/@types/express-serve-static-core": { + "version": "4.19.8", + "resolved": "https://registry.npmjs.org/@types/express-serve-static-core/-/express-serve-static-core-4.19.8.tgz", + "integrity": "sha512-02S5fmqeoKzVZCHPZid4b8JH2eM5HzQLZWN2FohQEy/0eXTq8VXZfSN6Pcr3F6N9R/vNrj7cpgbhjie6m/1tCA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*", + "@types/qs": "*", + "@types/range-parser": "*", + "@types/send": "*" + } + }, "node_modules/webpack-dev-server/node_modules/define-lazy-prop": { "version": "3.0.0", "resolved": 
"https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", From d959828908bcf48dfeafb6df1b15b238fa829a33 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 23 Jan 2026 13:48:24 +0500 Subject: [PATCH 071/187] Implement pagination for `/api/project/list` and `/api/users/list` (#3489) * Implement pagination for /api/projects/list * Test /api/projects/list pagination * Update ProjectsAPIClient.list() * Return projects total_count * Fix APIClient.list() * Test APIClient.list() * Implement pagination for /api/users/list * Add name_pattern for projects * Add TestProjectsAPIClientList * Add server-side validation for project name * Allow _ in name_pattern * Add name_pattern for users * Add @overload for ProjectsAPIClient.list() --- src/dstack/_internal/core/models/projects.py | 12 +- src/dstack/_internal/core/models/users.py | 12 +- .../_internal/server/routers/projects.py | 16 +- src/dstack/_internal/server/routers/users.py | 32 +++- src/dstack/_internal/server/schemas/fleets.py | 6 +- .../_internal/server/schemas/projects.py | 40 +++- src/dstack/_internal/server/schemas/users.py | 48 ++++- .../_internal/server/services/projects.py | 122 ++++++++---- src/dstack/_internal/server/services/users.py | 86 ++++++++- src/dstack/api/server/_projects.py | 76 +++++++- src/dstack/api/server/_users.py | 50 ++++- .../_internal/server/routers/test_projects.py | 180 +++++++++++++++++- .../_internal/server/routers/test_users.py | 160 +++++++++++++++- src/tests/api/common.py | 29 +++ src/tests/api/test_projects.py | 62 ++++++ src/tests/api/test_users.py | 54 ++++++ 16 files changed, 903 insertions(+), 82 deletions(-) create mode 100644 src/tests/api/common.py create mode 100644 src/tests/api/test_projects.py create mode 100644 src/tests/api/test_users.py diff --git a/src/dstack/_internal/core/models/projects.py b/src/dstack/_internal/core/models/projects.py index 9748ece1ae..63adf91962 100644 --- a/src/dstack/_internal/core/models/projects.py +++ 
b/src/dstack/_internal/core/models/projects.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import List, Optional +from typing import List, Optional, Union from pydantic import UUID4 @@ -28,6 +28,16 @@ class Project(CoreModel): is_public: bool = False +class ProjectsInfoList(CoreModel): + total_count: Optional[int] = None + projects: List[Project] + + +# For backward compatibility with 0.20 clients, endpoints return `List[Project]` if `total_count` is None. +# TODO: Replace with ProjectsInfoList in 0.21. +ProjectsInfoListOrProjectsList = Union[List[Project], ProjectsInfoList] + + class ProjectHookConfig(CoreModel): """ This class can be inherited to extend the project creation configuration passed to the hooks. diff --git a/src/dstack/_internal/core/models/users.py b/src/dstack/_internal/core/models/users.py index 99fb8823e3..8e70e092d6 100644 --- a/src/dstack/_internal/core/models/users.py +++ b/src/dstack/_internal/core/models/users.py @@ -1,6 +1,6 @@ import enum from datetime import datetime -from typing import Optional +from typing import List, Optional, Union from pydantic import UUID4 @@ -42,6 +42,16 @@ class UserWithCreds(User): ssh_private_key: Optional[str] = None +class UsersInfoList(CoreModel): + total_count: Optional[int] = None + users: List[User] + + +# For backward compatibility with 0.20 clients, endpoints return `List[User]` if `total_count` is None. +# TODO: Replace with UsersInfoList in 0.21. +UsersInfoListOrUsersList = Union[List[User], UsersInfoList] + + class UserHookConfig(CoreModel): """ This class can be inherited to extend the user creation configuration passed to the hooks. 
diff --git a/src/dstack/_internal/server/routers/projects.py b/src/dstack/_internal/server/routers/projects.py index b07b7b1c62..aaf08809b7 100644 --- a/src/dstack/_internal/server/routers/projects.py +++ b/src/dstack/_internal/server/routers/projects.py @@ -3,7 +3,7 @@ from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.models.projects import Project +from dstack._internal.core.models.projects import Project, ProjectsInfoListOrProjectsList from dstack._internal.server.db import get_session from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.projects import ( @@ -36,14 +36,14 @@ ) -@router.post("/list", response_model=List[Project]) +@router.post("/list", response_model=ProjectsInfoListOrProjectsList) async def list_projects( body: Optional[ListProjectsRequest] = None, session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), ): """ - Returns projects visible to the user, sorted by ascending `created_at`. + Returns projects visible to the user. Returns all accessible projects (member projects for regular users, all non-deleted projects for global admins, plus public projects if `include_not_joined` is `True`). 
@@ -55,7 +55,15 @@ async def list_projects( body = ListProjectsRequest() return CustomORJSONResponse( await projects.list_user_accessible_projects( - session=session, user=user, include_not_joined=body.include_not_joined + session=session, + user=user, + include_not_joined=body.include_not_joined, + return_total_count=body.return_total_count, + name_pattern=body.name_pattern, + prev_created_at=body.prev_created_at, + prev_id=body.prev_id, + limit=body.limit, + ascending=body.ascending, ) ) diff --git a/src/dstack/_internal/server/routers/users.py b/src/dstack/_internal/server/routers/users.py index 6030416f50..6cd72f00a1 100644 --- a/src/dstack/_internal/server/routers/users.py +++ b/src/dstack/_internal/server/routers/users.py @@ -1,16 +1,17 @@ -from typing import List +from typing import Optional from fastapi import APIRouter, Depends from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.errors import ResourceNotExistsError -from dstack._internal.core.models.users import User, UserWithCreds +from dstack._internal.core.models.users import User, UsersInfoListOrUsersList, UserWithCreds from dstack._internal.server.db import get_session from dstack._internal.server.models import UserModel from dstack._internal.server.schemas.users import ( CreateUserRequest, DeleteUsersRequest, GetUserRequest, + ListUsersRequest, RefreshTokenRequest, UpdateUserRequest, ) @@ -28,12 +29,35 @@ ) -@router.post("/list", response_model=List[User]) +@router.post("/list", response_model=UsersInfoListOrUsersList) async def list_users( + body: Optional[ListUsersRequest] = None, session: AsyncSession = Depends(get_session), user: UserModel = Depends(Authenticated()), ): - return CustomORJSONResponse(await users.list_users_for_user(session=session, user=user)) + """ + Returns users visible to the user, sorted by descending `created_at`. + + Admins see all non-deleted users. Non-admins only see themselves. + + The results are paginated. 
To get the next page, pass `created_at` and `id` of + the last user from the previous page as `prev_created_at` and `prev_id`. + """ + if body is None: + # For backward compatibility + body = ListUsersRequest() + return CustomORJSONResponse( + await users.list_users_for_user( + session=session, + user=user, + return_total_count=body.return_total_count, + name_pattern=body.name_pattern, + prev_created_at=body.prev_created_at, + prev_id=body.prev_id, + limit=body.limit, + ascending=body.ascending, + ) + ) @router.post("/get_my_user", response_model=UserWithCreds) diff --git a/src/dstack/_internal/server/schemas/fleets.py b/src/dstack/_internal/server/schemas/fleets.py index ae66818ab9..3df43d12ce 100644 --- a/src/dstack/_internal/server/schemas/fleets.py +++ b/src/dstack/_internal/server/schemas/fleets.py @@ -9,10 +9,10 @@ class ListFleetsRequest(CoreModel): - project_name: Optional[str] + project_name: Optional[str] = None only_active: bool = False - prev_created_at: Optional[datetime] - prev_id: Optional[UUID] + prev_created_at: Optional[datetime] = None + prev_id: Optional[UUID] = None limit: int = Field(100, ge=0, le=100) ascending: bool = False diff --git a/src/dstack/_internal/server/schemas/projects.py b/src/dstack/_internal/server/schemas/projects.py index ec05c1fb47..5f0133ab72 100644 --- a/src/dstack/_internal/server/schemas/projects.py +++ b/src/dstack/_internal/server/schemas/projects.py @@ -1,4 +1,6 @@ -from typing import Annotated, List +from datetime import datetime +from typing import Annotated, List, Optional +from uuid import UUID from pydantic import Field @@ -8,8 +10,42 @@ class ListProjectsRequest(CoreModel): include_not_joined: Annotated[ - bool, Field(description="Include public projects where user is not a member") + bool, Field(description="Include public projects where user is not a member.") ] = True + return_total_count: Annotated[ + bool, Field(description="Return `total_count` with the total number of projects.") + ] = False + 
name_pattern: Annotated[ + Optional[str], + Field( + description="Include only projects with the name containing `name_pattern`.", + regex="^[a-zA-Z0-9-_]*$", + ), + ] = None + prev_created_at: Annotated[ + Optional[datetime], + Field( + description="Paginate projects by specifying `created_at` of the last (first) project in previous batch for descending (ascending)." + ), + ] = None + prev_id: Annotated[ + Optional[UUID], + Field( + description=( + "Paginate projects by specifying `id` of the last (first) project in previous batch for descending (ascending)." + " Must be used together with `prev_created_at`." + ) + ), + ] = None + limit: Annotated[ + int, Field(ge=0, le=2000, description="Limit number of projects returned.") + ] = 2000 + ascending: Annotated[ + bool, + Field( + description="Return projects sorted by `created_at` in ascending order. Defaults to descending." + ), + ] = False class CreateProjectRequest(CoreModel): diff --git a/src/dstack/_internal/server/schemas/users.py b/src/dstack/_internal/server/schemas/users.py index 6579d96572..574d5b093e 100644 --- a/src/dstack/_internal/server/schemas/users.py +++ b/src/dstack/_internal/server/schemas/users.py @@ -1,9 +1,55 @@ -from typing import List, Optional +from datetime import datetime +from typing import Annotated, List, Optional +from uuid import UUID + +from pydantic import Field from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.users import GlobalRole +class ListUsersRequest(CoreModel): + return_total_count: Annotated[ + bool, Field(description="Return `total_count` with the total number of users.") + ] = False + name_pattern: Annotated[ + Optional[str], + Field( + description="Include only users with the name containing `name_pattern`.", + regex="^[a-zA-Z0-9-_]*$", + ), + ] = None + prev_created_at: Annotated[ + Optional[datetime], + Field( + description=( + "Paginate users by specifying `created_at` of the last (first) user in previous " + "batch for 
descending (ascending)." + ) + ), + ] = None + prev_id: Annotated[ + Optional[UUID], + Field( + description=( + "Paginate users by specifying `id` of the last (first) user in previous batch " + "for descending (ascending). Must be used together with `prev_created_at`." + ) + ), + ] = None + limit: Annotated[int, Field(ge=0, le=2000, description="Limit number of users returned.")] = ( + 2000 + ) + ascending: Annotated[ + bool, + Field( + description=( + "Return users sorted by `created_at` in ascending order. Defaults to descending." + ) + ), + ] = False + + class GetUserRequest(CoreModel): username: str diff --git a/src/dstack/_internal/server/services/projects.py b/src/dstack/_internal/server/services/projects.py index 937247f5a1..2383594690 100644 --- a/src/dstack/_internal/server/services/projects.py +++ b/src/dstack/_internal/server/services/projects.py @@ -1,8 +1,10 @@ +import re import secrets import uuid +from datetime import datetime from typing import Awaitable, Callable, List, Optional, Tuple -from sqlalchemy import delete, select, update +from sqlalchemy import and_, delete, literal_column, or_, select, update from sqlalchemy import func as safunc from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import QueryableAttribute, joinedload, load_only @@ -19,6 +21,8 @@ MemberPermissions, Project, ProjectHookConfig, + ProjectsInfoList, + ProjectsInfoListOrProjectsList, ) from dstack._internal.core.models.runs import RunStatus from dstack._internal.core.models.users import GlobalRole, ProjectRole @@ -62,57 +66,87 @@ async def get_or_create_default_project( return default_project, True -async def list_user_projects( - session: AsyncSession, - user: UserModel, -) -> List[Project]: - """ - Returns projects where the user is a member or all projects for global admins. 
- """ - projects = await list_user_project_models( - session=session, - user=user, - ) - projects = sorted(projects, key=lambda p: p.created_at) - return [ - project_model_to_project(p, include_backends=False, include_members=False) - for p in projects - ] - - async def list_user_accessible_projects( session: AsyncSession, user: UserModel, include_not_joined: bool, -) -> List[Project]: + return_total_count: bool, + name_pattern: Optional[str], + prev_created_at: Optional[datetime], + prev_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> ProjectsInfoListOrProjectsList: """ Returns all projects accessible to the user: + - All projects for global admins - Projects where user is a member (public or private) - if `include_not_joined`: Public projects where user is NOT a member """ - if user.global_role == GlobalRole.ADMIN: - projects = await list_project_models(session=session) - else: - projects = await list_member_project_models(session=session, user=user) + filters = [ProjectModel.deleted == False] + if name_pattern: + name_pattern = name_pattern.replace("_", "/_") + filters.append(ProjectModel.name.ilike(f"%{name_pattern}%", escape="/")) + stmt = select(ProjectModel).where(*filters) + if user.global_role != GlobalRole.ADMIN: + stmt = stmt.outerjoin( + MemberModel, + onclause=and_( + MemberModel.project_id == ProjectModel.id, + MemberModel.user_id == user.id, + ), + ) if include_not_joined: - public_projects = await list_public_non_member_project_models( - session=session, user=user + stmt = stmt.where( + or_( + ProjectModel.is_public == True, + MemberModel.user_id.is_not(None), + ) ) - projects += public_projects - - projects = sorted(projects, key=lambda p: p.created_at) - return [ - project_model_to_project(p, include_backends=False, include_members=False) - for p in projects - ] - - -async def list_projects(session: AsyncSession) -> List[Project]: - projects = await list_project_models(session=session) - return [ + else: + stmt = 
stmt.where(MemberModel.user_id.is_not(None)) + pagination_filters = [] + if prev_created_at is not None: + if ascending: + if prev_id is None: + pagination_filters.append(ProjectModel.created_at > prev_created_at) + else: + pagination_filters.append( + or_( + ProjectModel.created_at > prev_created_at, + and_( + ProjectModel.created_at == prev_created_at, ProjectModel.id < prev_id + ), + ) + ) + else: + if prev_id is None: + pagination_filters.append(ProjectModel.created_at < prev_created_at) + else: + pagination_filters.append( + or_( + ProjectModel.created_at < prev_created_at, + and_( + ProjectModel.created_at == prev_created_at, ProjectModel.id > prev_id + ), + ) + ) + order_by = (ProjectModel.created_at.desc(), ProjectModel.id) + if ascending: + order_by = (ProjectModel.created_at.asc(), ProjectModel.id.desc()) + total_count = None + if return_total_count: + res = await session.execute(stmt.with_only_columns(safunc.count(literal_column("1")))) + total_count = res.scalar_one() + res = await session.execute(stmt.where(*pagination_filters).order_by(*order_by).limit(limit)) + project_models = res.unique().scalars().all() + projects = [ project_model_to_project(p, include_backends=False, include_members=False) - for p in projects + for p in project_models ] + if total_count is None: + return projects + return ProjectsInfoList(total_count=total_count, projects=projects) async def get_project_by_name( @@ -543,6 +577,7 @@ async def get_project_model_by_id_or_error( async def create_project_model( session: AsyncSession, owner: UserModel, project_name: str, is_public: bool = False ) -> ProjectModel: + validate_project_name(project_name) private_bytes, public_bytes = await run_async( generate_rsa_key_pair_bytes, f"{project_name}@dstack" ) @@ -649,6 +684,15 @@ def get_member_permissions(member_model: MemberModel) -> MemberPermissions: ) +def validate_project_name(project_name: str): + if not is_valid_project_name(project_name): + raise ServerClientError("Project name 
should match regex '^[a-zA-Z0-9-_]{1,50}$'") + + +def is_valid_project_name(project_name: str) -> bool: + return re.match("^[a-zA-Z0-9-_]{1,50}$", project_name) is not None + + _CREATE_PROJECT_HOOKS = [] diff --git a/src/dstack/_internal/server/services/users.py b/src/dstack/_internal/server/services/users.py index 3f8f6afa7b..73ceebe0ef 100644 --- a/src/dstack/_internal/server/services/users.py +++ b/src/dstack/_internal/server/services/users.py @@ -5,9 +5,10 @@ import uuid from collections.abc import AsyncGenerator from contextlib import asynccontextmanager +from datetime import datetime from typing import Awaitable, Callable, List, Optional, Tuple -from sqlalchemy import delete, select +from sqlalchemy import and_, delete, literal_column, or_, select from sqlalchemy import func as safunc from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import load_only @@ -21,6 +22,8 @@ User, UserHookConfig, UserPermissions, + UsersInfoList, + UsersInfoListOrUsersList, UserTokenCreds, UserWithCreds, ) @@ -55,23 +58,90 @@ async def get_or_create_admin_user(session: AsyncSession) -> Tuple[UserModel, bo async def list_users_for_user( session: AsyncSession, user: UserModel, -) -> List[User]: + return_total_count: bool, + name_pattern: Optional[str], + prev_created_at: Optional[datetime], + prev_id: Optional[uuid.UUID], + limit: int, + ascending: bool, +) -> UsersInfoListOrUsersList: if user.global_role == GlobalRole.ADMIN: - return await list_all_users(session=session) - return [user_model_to_user(user)] + return await list_all_users( + session=session, + include_deleted=False, + return_total_count=return_total_count, + name_pattern=name_pattern, + prev_created_at=prev_created_at, + prev_id=prev_id, + limit=limit, + ascending=ascending, + ) + users = [] + if not user.deleted and (name_pattern is None or name_pattern.lower() in user.name.lower()): + users.append(user_model_to_user(user)) + if return_total_count: + return UsersInfoList(total_count=len(users), 
users=users) + return users async def list_all_users( session: AsyncSession, include_deleted: bool = False, -) -> List[User]: + return_total_count: bool = False, + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[uuid.UUID] = None, + limit: int = 2000, + ascending: bool = False, +) -> UsersInfoListOrUsersList: filters = [] if not include_deleted: filters.append(UserModel.deleted == False) - res = await session.execute(select(UserModel).where(*filters)) + if name_pattern: + name_pattern = name_pattern.replace("_", "/_") + filters.append(UserModel.name.ilike(f"%{name_pattern}%", escape="/")) + stmt = select(UserModel).where(*filters) + pagination_filters = [] + if prev_created_at is not None: + if ascending: + if prev_id is None: + pagination_filters.append(UserModel.created_at > prev_created_at) + else: + pagination_filters.append( + or_( + UserModel.created_at > prev_created_at, + and_( + UserModel.created_at == prev_created_at, + UserModel.id < prev_id, + ), + ) + ) + else: + if prev_id is None: + pagination_filters.append(UserModel.created_at < prev_created_at) + else: + pagination_filters.append( + or_( + UserModel.created_at < prev_created_at, + and_( + UserModel.created_at == prev_created_at, + UserModel.id > prev_id, + ), + ) + ) + order_by = (UserModel.created_at.desc(), UserModel.id) + if ascending: + order_by = (UserModel.created_at.asc(), UserModel.id.desc()) + total_count = None + if return_total_count: + res = await session.execute(stmt.with_only_columns(safunc.count(literal_column("1")))) + total_count = res.scalar_one() + res = await session.execute(stmt.where(*pagination_filters).order_by(*order_by).limit(limit)) user_models = res.scalars().all() - user_models = sorted(user_models, key=lambda u: u.created_at) - return [user_model_to_user(u) for u in user_models] + users = [user_model_to_user(u) for u in user_models] + if total_count is None: + return users + return 
UsersInfoList(total_count=total_count, users=users) async def get_user_with_creds_by_name( diff --git a/src/dstack/api/server/_projects.py b/src/dstack/api/server/_projects.py index 31bdc3b2de..96a1f511f7 100644 --- a/src/dstack/api/server/_projects.py +++ b/src/dstack/api/server/_projects.py @@ -1,14 +1,20 @@ -from typing import List +import json +from datetime import datetime +from typing import Any, List, Literal, Optional, Union, overload +from uuid import UUID from pydantic import parse_obj_as -from dstack._internal.core.models.projects import Project +from dstack._internal.core.models.projects import ( + Project, + ProjectsInfoList, + ProjectsInfoListOrProjectsList, +) from dstack._internal.core.models.users import ProjectRole from dstack._internal.server.schemas.projects import ( AddProjectMemberRequest, CreateProjectRequest, DeleteProjectsRequest, - ListProjectsRequest, MemberSetting, RemoveProjectMemberRequest, SetProjectMembersRequest, @@ -17,10 +23,66 @@ class ProjectsAPIClient(APIClientGroup): - def list(self, include_not_joined: bool = True) -> List[Project]: - body = ListProjectsRequest(include_not_joined=include_not_joined) - resp = self._request("/api/projects/list", body=body.json()) - return parse_obj_as(List[Project.__response__], resp.json()) + @overload + def list( + self, + include_not_joined: bool = True, + *, + return_total_count: Literal[True], + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: Optional[int] = None, + ascending: Optional[bool] = None, + ) -> ProjectsInfoList: + pass + + @overload + def list( + self, + include_not_joined: bool = True, + *, + return_total_count: Union[Literal[False], None] = None, + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: Optional[int] = None, + ascending: Optional[bool] = None, + ) -> List[Project]: + pass + + def list( + self, + include_not_joined: 
bool = True, + *, + return_total_count: Optional[bool] = None, + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: Optional[int] = None, + ascending: Optional[bool] = None, + ) -> ProjectsInfoListOrProjectsList: + # Passing only non-None fields for backward compatibility with 0.20 servers. + body: dict[str, Any] = { + "include_not_joined": include_not_joined, + } + if return_total_count is not None: + body["return_total_count"] = return_total_count + if name_pattern is not None: + body["name_pattern"] = name_pattern + if prev_created_at is not None: + body["prev_created_at"] = prev_created_at.isoformat() + if prev_id is not None: + body["prev_id"] = str(prev_id) + if limit is not None: + body["limit"] = limit + if ascending is not None: + body["ascending"] = ascending + resp = self._request("/api/projects/list", body=json.dumps(body)) + resp_json = resp.json() + if isinstance(resp_json, list): + return parse_obj_as(List[Project.__response__], resp_json) + return parse_obj_as(ProjectsInfoList, resp_json) def create(self, project_name: str, is_public: bool = False) -> Project: body = CreateProjectRequest(project_name=project_name, is_public=is_public) diff --git a/src/dstack/api/server/_users.py b/src/dstack/api/server/_users.py index 6082636c4b..885eae54a2 100644 --- a/src/dstack/api/server/_users.py +++ b/src/dstack/api/server/_users.py @@ -1,8 +1,18 @@ -from typing import List +import json +from datetime import datetime +from typing import Any, List, Optional +from uuid import UUID from pydantic import parse_obj_as +from pydantic.json import pydantic_encoder -from dstack._internal.core.models.users import GlobalRole, User, UserWithCreds +from dstack._internal.core.models.users import ( + GlobalRole, + User, + UsersInfoList, + UsersInfoListOrUsersList, + UserWithCreds, +) from dstack._internal.server.schemas.users import ( CreateUserRequest, GetUserRequest, @@ -13,9 +23,39 @@ class 
UsersAPIClient(APIClientGroup): - def list(self) -> List[User]: - resp = self._request("/api/users/list") - return parse_obj_as(List[User.__response__], resp.json()) + def list( + self, + return_total_count: Optional[bool] = None, + name_pattern: Optional[str] = None, + prev_created_at: Optional[datetime] = None, + prev_id: Optional[UUID] = None, + limit: Optional[int] = None, + ascending: Optional[bool] = None, + ) -> UsersInfoListOrUsersList: + # Passing only non-None fields for backward compatibility with 0.20 servers. + body: dict[str, Any] = {} + if return_total_count is not None: + body["return_total_count"] = return_total_count + if name_pattern is not None: + body["name_pattern"] = name_pattern + if prev_created_at is not None: + body["prev_created_at"] = prev_created_at + if prev_id is not None: + body["prev_id"] = prev_id + if limit is not None: + body["limit"] = limit + if ascending is not None: + body["ascending"] = ascending + if body: + resp = self._request( + "/api/users/list", body=json.dumps(body, default=pydantic_encoder) + ) + else: + resp = self._request("/api/users/list") + resp_json = resp.json() + if isinstance(resp_json, list): + return parse_obj_as(List[User.__response__], resp_json) + return parse_obj_as(UsersInfoList, resp_json) def get_my_user(self) -> UserWithCreds: resp = self._request("/api/users/get_my_user") diff --git a/src/tests/_internal/server/routers/test_projects.py b/src/tests/_internal/server/routers/test_projects.py index 5c9ef42ffb..2da65c7d5c 100644 --- a/src/tests/_internal/server/routers/test_projects.py +++ b/src/tests/_internal/server/routers/test_projects.py @@ -15,7 +15,6 @@ from dstack._internal.server.services.permissions import DefaultPermissions from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( - create_backend, create_fleet, create_project, create_repo, @@ -65,10 +64,6 @@ async def test_returns_projects(self, test_db, session: 
AsyncSession, client: As await add_project_member( session=session, project=project, user=user, project_role=ProjectRole.ADMIN ) - await create_backend( - session=session, - project_id=project.id, - ) response = await client.post("/api/projects/list", headers=get_auth_headers(user.token)) assert response.status_code in [200] assert response.json() == [ @@ -216,6 +211,181 @@ async def test_member_sees_both_public_and_private_projects( assert "public_project" in project_names assert "private_project" in project_names + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_paginated_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 0, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + project1 = await create_project( + session=session, + name="project1", + owner=user, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + project2 = await create_project( + session=session, + name="project2", + owner=user, + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + project3 = await create_project( + session=session, + name="project3", + owner=user, + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + ) + response = await client.post( + "/api/projects/list", + headers=get_auth_headers(user.token), + json={"limit": 1}, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "project_id": str(project3.id), + "project_name": project3.name, + "owner": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:00:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:06:00+00:00", + "backends": [], + "members": [], + "is_public": False, + } + ] + response = await client.post( + 
"/api/projects/list", + headers=get_auth_headers(user.token), + json={ + "prev_created_at": "2023-01-02T03:06:00+00:00", + "prev_id": str(project3.id), + "limit": 1, + }, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "project_id": str(project2.id), + "project_name": project2.name, + "owner": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:00:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:05:00+00:00", + "backends": [], + "members": [], + "is_public": False, + } + ] + response = await client.post( + "/api/projects/list", + headers=get_auth_headers(user.token), + json={ + "prev_created_at": "2023-01-02T03:05:00+00:00", + "prev_id": str(project2.id), + "limit": 1, + }, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "project_id": str(project1.id), + "project_name": project1.name, + "owner": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:00:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "backends": [], + "members": [], + "is_public": False, + } + ] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_total_count(self, test_db, session: AsyncSession, client: AsyncClient): + user = await create_user( + session=session, + created_at=datetime(2023, 1, 2, 3, 0, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + await create_project( + session=session, + name="project1", + owner=user, + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + ) + project3 = await create_project( + session=session, + name="project3", + owner=user, + 
created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + ) + response = await client.post( + "/api/projects/list", + headers=get_auth_headers(user.token), + json={"limit": 1, "return_total_count": True}, + ) + assert response.status_code == 200 + assert response.json() == { + "total_count": 2, + "projects": [ + { + "project_id": str(project3.id), + "project_name": project3.name, + "owner": { + "id": str(user.id), + "username": user.name, + "created_at": "2023-01-02T03:00:00+00:00", + "global_role": user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:05:00+00:00", + "backends": [], + "members": [], + "is_public": False, + } + ], + } + + +class TestListOnlyNoFleets: @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_only_no_fleets_returns_projects_without_active_fleets( diff --git a/src/tests/_internal/server/routers/test_users.py b/src/tests/_internal/server/routers/test_users.py index 6c5b373a63..5042e75d6b 100644 --- a/src/tests/_internal/server/routers/test_users.py +++ b/src/tests/_internal/server/routers/test_users.py @@ -39,7 +39,7 @@ async def test_admins_see_all_non_deleted_users( admin = await create_user( session=session, name="admin", - created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), global_role=GlobalRole.ADMIN, ) other_user = await create_user( @@ -61,7 +61,7 @@ async def test_admins_see_all_non_deleted_users( { "id": str(admin.id), "username": admin.name, - "created_at": "2023-01-02T03:04:00+00:00", + "created_at": "2023-01-02T03:05:00+00:00", "global_role": admin.global_role, "email": None, "active": True, @@ -84,6 +84,162 @@ async def test_admins_see_all_non_deleted_users( }, ] + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_returns_total_count(self, test_db, session: AsyncSession, client: AsyncClient): + admin = await create_user( + session=session, + name="admin", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + await create_user( + session=session, + name="user_one", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + await create_user( + session=session, + name="deleted_user", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + deleted=True, + ) + response = await client.post( + "/api/users/list", + headers=get_auth_headers(admin.token), + json={"limit": 1, "return_total_count": True}, + ) + assert response.status_code == 200 + assert response.json() == { + "total_count": 2, + "users": [ + { + "id": str(admin.id), + "username": admin.name, + "created_at": "2023-01-02T03:06:00+00:00", + "global_role": admin.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + } + ], + } + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_paginates_results(self, test_db, session: AsyncSession, client: AsyncClient): + admin = await create_user( + session=session, + name="admin", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + user_one = await create_user( + session=session, + name="user_one", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + await create_user( + session=session, + name="user_two", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + response = await client.post( + "/api/users/list", + headers=get_auth_headers(admin.token), + json={"limit": 1}, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(admin.id), + "username": 
admin.name, + "created_at": "2023-01-02T03:06:00+00:00", + "global_role": admin.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + } + ] + response = await client.post( + "/api/users/list", + headers=get_auth_headers(admin.token), + json={ + "prev_created_at": "2023-01-02T03:06:00+00:00", + "prev_id": str(admin.id), + "limit": 1, + }, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(user_one.id), + "username": user_one.name, + "created_at": "2023-01-02T03:05:00+00:00", + "global_role": user_one.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + } + ] + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_filters_by_name_pattern( + self, test_db, session: AsyncSession, client: AsyncClient + ): + admin = await create_user( + session=session, + name="admin", + created_at=datetime(2023, 1, 2, 3, 6, tzinfo=timezone.utc), + global_role=GlobalRole.ADMIN, + ) + matching_user = await create_user( + session=session, + name="alpha_user", + created_at=datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + await create_user( + session=session, + name="bravo", + created_at=datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc), + global_role=GlobalRole.USER, + ) + response = await client.post( + "/api/users/list", + headers=get_auth_headers(admin.token), + json={"name_pattern": "alpha"}, + ) + assert response.status_code == 200 + assert response.json() == [ + { + "id": str(matching_user.id), + "username": matching_user.name, + "created_at": "2023-01-02T03:05:00+00:00", + "global_role": matching_user.global_role, + "email": None, + "active": True, + "permissions": { + "can_create_projects": True, + }, + "ssh_public_key": None, + } + ] + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", 
["sqlite", "postgres"], indirect=True) async def test_non_admins_see_only_themselves( diff --git a/src/tests/api/common.py b/src/tests/api/common.py new file mode 100644 index 0000000000..c453b6afee --- /dev/null +++ b/src/tests/api/common.py @@ -0,0 +1,29 @@ +import json +from dataclasses import dataclass, field +from typing import Any, Optional + +import requests + + +@dataclass +class RequestRecorder: + payload: Any + last_path: Optional[str] = None + last_body: Optional[str] = None + last_kwargs: dict[str, Any] = field(default_factory=dict) + + def __call__( + self, + path: str, + body: Optional[str] = None, + raise_for_status: bool = True, + method: str = "POST", + **kwargs, + ) -> requests.Response: + self.last_path = path + self.last_body = body + self.last_kwargs = kwargs + resp = requests.Response() + resp.status_code = 200 + resp._content = json.dumps(self.payload).encode("utf-8") + return resp diff --git a/src/tests/api/test_projects.py b/src/tests/api/test_projects.py new file mode 100644 index 0000000000..38b93bae5a --- /dev/null +++ b/src/tests/api/test_projects.py @@ -0,0 +1,62 @@ +import json +import logging +from datetime import datetime, timezone +from uuid import UUID + +from dstack.api.server._projects import ProjectsAPIClient +from tests.api.common import RequestRecorder + +PROJECT_PAYLOAD = { + "project_id": "1b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + "project_name": "p", + "owner": { + "id": "2b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e", + "username": "u", + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": "user", + "email": None, + "active": True, + "permissions": {"can_create_projects": True}, + "ssh_public_key": None, + }, + "created_at": "2023-01-02T03:04:00+00:00", + "backends": [], + "members": [], + "is_public": False, +} + + +class TestProjectsAPIClientList: + def test_projects_list_serializes_pagination_and_parses_info_list(self): + request = RequestRecorder(payload={"total_count": 1, "projects": [PROJECT_PAYLOAD]}) + client = 
ProjectsAPIClient(_request=request, _logger=logging.getLogger("test")) + dt = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + pid = UUID("3b0e1b45-2f8c-4ab6-8010-a0d1a3e44e0e") + + result = client.list( + return_total_count=True, + prev_created_at=dt, + name_pattern="p", + prev_id=pid, + limit=1, + ascending=True, + ) + + payload = json.loads(request.last_body) + assert request.last_path == "/api/projects/list" + assert payload["include_not_joined"] is True + assert payload["return_total_count"] is True + assert payload["name_pattern"] == "p" + assert payload["prev_created_at"] == dt.isoformat() + assert payload["prev_id"] == str(pid) + assert payload["limit"] == 1 + assert payload["ascending"] is True + assert result.total_count == 1 + assert result.projects[0].project_name == "p" + + def test_projects_list_parses_list_response(self): + request = RequestRecorder(payload=[PROJECT_PAYLOAD]) + client = ProjectsAPIClient(_request=request, _logger=logging.getLogger("test")) + result = client.list() + assert isinstance(result, list) + assert result[0].project_name == PROJECT_PAYLOAD["project_name"] diff --git a/src/tests/api/test_users.py b/src/tests/api/test_users.py new file mode 100644 index 0000000000..c01703b811 --- /dev/null +++ b/src/tests/api/test_users.py @@ -0,0 +1,54 @@ +import json +import logging +from datetime import datetime, timezone +from uuid import UUID + +from dstack.api.server._users import UsersAPIClient +from tests.api.common import RequestRecorder + +USER_PAYLOAD = { + "id": "11111111-1111-4111-8111-111111111111", + "username": "user", + "created_at": "2023-01-02T03:04:00+00:00", + "global_role": "user", + "email": None, + "active": True, + "permissions": {"can_create_projects": True}, + "ssh_public_key": None, +} + + +class TestUsersAPIClientList: + def test_serializes_pagination_and_parses_info_list(self): + recorder = RequestRecorder({"total_count": 1, "users": [USER_PAYLOAD]}) + client = UsersAPIClient(_request=recorder, 
_logger=logging.getLogger("test")) + dt = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + uid = UUID("22222222-2222-4222-8222-222222222222") + + result = client.list( + return_total_count=True, + name_pattern="user", + prev_created_at=dt, + prev_id=uid, + limit=1, + ascending=True, + ) + + payload = json.loads(recorder.last_body) + assert recorder.last_path == "/api/users/list" + assert payload["return_total_count"] is True + assert payload["name_pattern"] == "user" + assert payload["prev_created_at"] == dt.isoformat() + assert payload["prev_id"] == str(uid) + assert payload["limit"] == 1 + assert payload["ascending"] is True + assert result.total_count == 1 + assert result.users[0].username == "user" + + def test_parses_list_response(self): + recorder = RequestRecorder([USER_PAYLOAD]) + client = UsersAPIClient(_request=recorder, _logger=logging.getLogger("test")) + result = client.list() + + assert isinstance(result, list) + assert result[0].username == "user" From 1c3b7f8078de3955841be1611bb4d3b691b0614b Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 23 Jan 2026 14:02:11 +0500 Subject: [PATCH 072/187] Update dstack server CLI logo (#3438) * Update dstack server CLI logo * Remove extra padding --- src/dstack/_internal/server/app.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index b41152c149..dbea6f777b 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -419,16 +419,11 @@ def _sentry_traces_sampler(sampling_context: SamplingContext) -> float: def _print_dstack_logo(): console.print( - """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮ -╱╱┃┃╱╭╯╰╮╱╱╱╱╱┃┃ -╭━╯┣━┻╮╭╋━━┳━━┫┃╭╮ -┃╭╮┃━━┫┃┃╭╮┃╭━┫╰╯╯ -┃╰╯┣━━┃╰┫╭╮┃╰━┫╭╮╮ -╰━━┻━━┻━┻╯╰┻━━┻╯╰╯ -╭━━┳━━┳━┳╮╭┳━━┳━╮ -┃━━┫┃━┫╭┫╰╯┃┃━┫╭╯ -┣━━┃┃━┫┃╰╮╭┫┃━┫┃ -╰━━┻━━┻╯╱╰╯╰━━┻╯ + r"""[purple] _ _ _ + __| |___| |_ __ _ ___| | __ ___ ___ _ ____ _____ _ __ + / _` / __| __/ _` |/ __| |/ / / __|/ _ \ 
'__\ \ / / _ \ '__| +| (_| \__ \ || (_| | (__| < \__ \ __/ | \ V / __/ | + \__,_|___/\__\__,_|\___|_|\_\ |___/\___|_| \_/ \___|_| [/]""" ) From 330eb1f0ba9b1b05ab3953303d6e0aeb8540d437 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 23 Jan 2026 16:33:25 +0500 Subject: [PATCH 073/187] Move pytest.ini options to pyproject.toml (#3491) * Move pytest.ini options to pyproject.toml * Fix test_logs * Fix decorator order * Fix TestRESTPlugin --- pyproject.toml | 24 +++++++- pytest.ini | 12 ---- src/tests/_internal/server/conftest.py | 2 +- .../_internal/server/services/test_logs.py | 2 +- src/tests/plugins/test_rest_plugin.py | 57 +++++++++---------- 5 files changed, 53 insertions(+), 44 deletions(-) delete mode 100644 pytest.ini diff --git a/pyproject.toml b/pyproject.toml index 2fe97f2cbb..8e4e7bc070 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,11 +96,33 @@ ignore = [ "src/dstack/_internal/server/migrations/versions", ] +[tool.pytest.ini_options] +testpaths = ["src/tests"] +addopts = [ + "--disable-socket", + "--allow-hosts=127.0.0.1,localhost", + # unix socket for Docker/testcontainers + "--allow-unix-socket", +] +markers = [ + "shim_version", + "dockerized", +] +env = [ + "DSTACK_CLI_RICH_FORCE_TERMINAL=0", +] +filterwarnings = [ + # testcontainers modules use deprecated decorators – nothing we can do: + # https://github.com/testcontainers/testcontainers-python/issues/874 + "ignore:^The @wait_container_is_ready decorator:DeprecationWarning" +] + [dependency-groups] dev = [ "pre-commit>=4.2.0", - "pytest~=7.2", + "pytest~=8.0", "pytest-asyncio>=0.23.8", + "pytest-mock>=3.14.0", "pytest-httpbin>=2.1.0", "pytest-socket>=0.7.0", "pytest-env>=1.1.0", diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 30c0e62811..0000000000 --- a/pytest.ini +++ /dev/null @@ -1,12 +0,0 @@ -[pytest] -testpaths = src/tests -addopts = - --disable-socket - --allow-hosts=127.0.0.1,localhost - ; unix socket for Docker/testcontainers - --allow-unix-socket 
-markers = - shim_version - dockerized -env = - DSTACK_CLI_RICH_FORCE_TERMINAL=0 diff --git a/src/tests/_internal/server/conftest.py b/src/tests/_internal/server/conftest.py index 623723f07c..9bb508c5d6 100644 --- a/src/tests/_internal/server/conftest.py +++ b/src/tests/_internal/server/conftest.py @@ -13,7 +13,7 @@ @pytest.fixture -def client(event_loop): +def client(): transport = httpx.ASGITransport(app=app) return httpx.AsyncClient(transport=transport, base_url="http://test") diff --git a/src/tests/_internal/server/services/test_logs.py b/src/tests/_internal/server/services/test_logs.py index 892ed2d77b..06bfca7dea 100644 --- a/src/tests/_internal/server/services/test_logs.py +++ b/src/tests/_internal/server/services/test_logs.py @@ -798,8 +798,8 @@ async def test_poll_logs_descending_malformed_lines( class TestCloudWatchLogStorage: FAKE_NOW = datetime(2023, 10, 6, 10, 1, 54, tzinfo=timezone.utc) - @freeze_time(FAKE_NOW) @pytest_asyncio.fixture + @freeze_time(FAKE_NOW) async def project(self, test_db, session: AsyncSession) -> ProjectModel: project = await create_project(session=session, name="test-proj") return project diff --git a/src/tests/plugins/test_rest_plugin.py b/src/tests/plugins/test_rest_plugin.py index 0b725ff95b..7d9e35a51d 100644 --- a/src/tests/plugins/test_rest_plugin.py +++ b/src/tests/plugins/test_rest_plugin.py @@ -1,7 +1,6 @@ import json import os from contextlib import nullcontext as does_not_raise -from unittest import mock from unittest.mock import Mock import pytest @@ -101,14 +100,14 @@ async def test_on_run_apply_plugin_service_uri_not_set(self): CustomApplyPolicy() @pytest.mark.asyncio - @mock.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @pytest.mark.parametrize( "spec", ["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True ) async def test_on_apply_plugin_service_returns_mutated_spec( - self, test_db, user, 
project, spec + self, mocker, test_db, user, project, spec ): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) policy = CustomApplyPolicy() mock_response = Mock() response_dict = {"spec": spec.dict(), "error": None} @@ -120,55 +119,54 @@ async def test_on_apply_plugin_service_returns_mutated_spec( mock_response.text = json.dumps(response_dict) mock_response.raise_for_status = Mock() - with mock.patch("requests.post", return_value=mock_response): - result = policy.on_apply(user=user.name, project=project.name, spec=spec) - assert result == type(spec)(**response_dict["spec"]) + mocker.patch("requests.post", return_value=mock_response) + result = policy.on_apply(user=user.name, project=project.name, spec=spec) + assert result == type(spec)(**response_dict["spec"]) @pytest.mark.asyncio - @mock.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @pytest.mark.parametrize( "spec", ["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True ) - async def test_on_apply_plugin_service_call_fails(self, test_db, user, project, spec): + async def test_on_apply_plugin_service_call_fails(self, mocker, test_db, user, project, spec): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) policy = CustomApplyPolicy() - with mock.patch("requests.post", side_effect=requests.RequestException("fail")): - with pytest.raises(ServerClientError): - policy.on_apply(user=user.name, project=project.name, spec=spec) + mocker.patch("requests.post", side_effect=requests.RequestException("fail")) + with pytest.raises(ServerClientError): + policy.on_apply(user=user.name, project=project.name, spec=spec) @pytest.mark.asyncio - @mock.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @pytest.mark.parametrize( "spec", ["run_spec", 
"fleet_spec", "volume_spec", "gateway_spec"], indirect=True ) - async def test_on_apply_plugin_service_connection_fails(self, test_db, user, project, spec): + async def test_on_apply_plugin_service_connection_fails( + self, mocker, test_db, user, project, spec + ): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) policy = CustomApplyPolicy() - with mock.patch( - "requests.post", side_effect=requests.ConnectionError("Failed to connect") - ): - with pytest.raises(ServerClientError): - policy.on_apply(user=user.name, project=project.name, spec=spec) + mocker.patch("requests.post", side_effect=requests.ConnectionError("Failed to connect")) + with pytest.raises(ServerClientError): + policy.on_apply(user=user.name, project=project.name, spec=spec) @pytest.mark.asyncio - @mock.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @pytest.mark.parametrize( "spec", ["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True ) async def test_on_apply_plugin_service_returns_invalid_spec( - self, test_db, user, project, spec + self, mocker, test_db, user, project, spec ): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) policy = CustomApplyPolicy() mock_response = Mock() mock_response.text = json.dumps({"invalid-key": "abc"}) mock_response.raise_for_status = Mock() - with mock.patch("requests.post", return_value=mock_response): - with pytest.raises(ServerClientError): - policy.on_apply(user.name, project=project.name, spec=spec) + mocker.patch("requests.post", return_value=mock_response) + with pytest.raises(ServerClientError): + policy.on_apply(user.name, project=project.name, spec=spec) @pytest.mark.asyncio - @mock.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @pytest.mark.parametrize( "spec", 
["run_spec", "fleet_spec", "volume_spec", "gateway_spec"], indirect=True @@ -194,14 +192,15 @@ async def test_on_apply_plugin_service_returns_invalid_spec( ], ) async def test_on_apply_plugin_service_error_handling( - self, test_db, user, project, spec, error, expectation + self, mocker, test_db, user, project, spec, error, expectation ): + mocker.patch.dict(os.environ, {PLUGIN_SERVICE_URI_ENV_VAR_NAME: "http://mock"}) policy = CustomApplyPolicy() mock_response = Mock() response_dict = {"spec": spec.dict(), "error": error} mock_response.text = json.dumps(response_dict) mock_response.raise_for_status = Mock() - with mock.patch("requests.post", return_value=mock_response): - with expectation: - result = policy.on_apply(user=user.name, project=project.name, spec=spec) - assert result == type(spec)(**response_dict["spec"]) + mocker.patch("requests.post", return_value=mock_response) + with expectation: + result = policy.on_apply(user=user.name, project=project.name, spec=spec) + assert result == type(spec)(**response_dict["spec"]) From 802c450ad4a34f7352d8be29afa6847c5b84be60 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Fri, 23 Jan 2026 15:52:18 +0100 Subject: [PATCH 074/187] [UX] Make `dstack project` and `dstack project set-default` interactive for default project selection (#3488) --- pyproject.toml | 2 +- src/dstack/_internal/cli/commands/project.py | 118 +++++++++++++++++-- 2 files changed, 107 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8e4e7bc070..37588341e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "rich", "rich-argparse", "tqdm", - "simple-term-menu", + "questionary>=2.0.1", "pydantic>=1.10.10,<2.0.0", "pydantic-duality>=1.2.4", "websocket-client", diff --git a/src/dstack/_internal/cli/commands/project.py b/src/dstack/_internal/cli/commands/project.py index 0f6e5b4db5..db4a7a5eb9 100644 --- 
a/src/dstack/_internal/cli/commands/project.py +++ b/src/dstack/_internal/cli/commands/project.py @@ -1,19 +1,77 @@ import argparse -from typing import Any, Union +import sys +from typing import Any, Optional, Union from requests import HTTPError from rich.table import Table +try: + import questionary + + is_project_menu_supported = sys.stdin.isatty() +except (ImportError, NotImplementedError, AttributeError): + is_project_menu_supported = False + import dstack.api.server from dstack._internal.cli.commands import BaseCommand from dstack._internal.cli.utils.common import add_row_from_dict, confirm_ask, console from dstack._internal.core.errors import ClientError, CLIError +from dstack._internal.core.models.config import ProjectConfig from dstack._internal.core.services.configs import ConfigManager from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) +def select_default_project( + project_configs: list[ProjectConfig], default_project: Optional[ProjectConfig] +) -> Optional[ProjectConfig]: + """Show an interactive menu to select a default project. + + This method only prompts for selection and does not update the configuration. + Use `ConfigManager.configure_project()` and `ConfigManager.save()` to persist + the selected project as default. + + Args: + project_configs: Non-empty list of available project configurations. + default_project: Currently default project, if any. + + Returns: + Selected project configuration, or None if cancelled. + + Raises: + CLIError: If `is_project_menu_supported` is False or `project_configs` is empty. 
+ """ + if not is_project_menu_supported: + raise CLIError("Interactive menu is not supported on this platform") + + if len(project_configs) == 0: + raise CLIError("No projects configured") + + menu_entries = [] + default_index = None + for i, project_config in enumerate(project_configs): + is_default = project_config.name == default_project.name if default_project else False + entry = f"{project_config.name} ({project_config.url})" + if is_default: + default_index = i + menu_entries.append((entry, i)) + + choices = [questionary.Choice(title=entry, value=index) for entry, index in menu_entries] # pyright: ignore[reportPossiblyUnboundVariable] + default_value = default_index + selected_index = questionary.select( # pyright: ignore[reportPossiblyUnboundVariable] + message="Select the default project:", + choices=choices, + default=default_value, # pyright: ignore[reportArgumentType] + qmark="", + instruction="(↑↓ Enter)", + ).ask() + + if selected_index is not None and isinstance(selected_index, int): + return project_configs[selected_index] + return None + + class ProjectCommand(BaseCommand): NAME = "project" DESCRIPTION = "Manage projects configs" @@ -67,14 +125,17 @@ def _register(self): # Set default subcommand set_default_parser = subparsers.add_parser("set-default", help="Set default project") set_default_parser.add_argument( - "name", type=str, help="The name of the project to set as default" + "name", + type=str, + nargs="?" 
if is_project_menu_supported else None, + help="The name of the project to set as default", ) set_default_parser.set_defaults(subfunc=self._set_default) def _command(self, args: argparse.Namespace): super()._command(args) if not hasattr(args, "subfunc"): - args.subfunc = self._list + args.subfunc = self._project args.subfunc(args) def _add(self, args: argparse.Namespace): @@ -156,14 +217,47 @@ def _list(self, args: argparse.Namespace): console.print(table) + def _project(self, args: argparse.Namespace): + if is_project_menu_supported and not getattr(args, "verbose", False): + config_manager = ConfigManager() + project_configs = config_manager.list_project_configs() + default_project = config_manager.get_project_config() + selected_project = select_default_project(project_configs, default_project) + if selected_project is not None: + config_manager.configure_project( + name=selected_project.name, + url=selected_project.url, + token=selected_project.token, + default=True, + ) + config_manager.save() + console.print("[grey58]OK[/]") + else: + self._list(args) + def _set_default(self, args: argparse.Namespace): - config_manager = ConfigManager() - project_config = config_manager.get_project_config(args.name) - if project_config is None: - raise CLIError(f"Project '{args.name}' not found") + if args.name: + config_manager = ConfigManager() + project_config = config_manager.get_project_config(args.name) + if project_config is None: + raise CLIError(f"Project '{args.name}' not found") - config_manager.configure_project( - name=args.name, url=project_config.url, token=project_config.token, default=True - ) - config_manager.save() - console.print("[grey58]OK[/]") + config_manager.configure_project( + name=args.name, url=project_config.url, token=project_config.token, default=True + ) + config_manager.save() + console.print("[grey58]OK[/]") + else: + config_manager = ConfigManager() + project_configs = config_manager.list_project_configs() + default_project = 
config_manager.get_project_config() + selected_project = select_default_project(project_configs, default_project) + if selected_project is not None: + config_manager.configure_project( + name=selected_project.name, + url=selected_project.url, + token=selected_project.token, + default=True, + ) + config_manager.save() + console.print("[grey58]OK[/]") From bd2d485f73bee893560f832e3459b39bd7e6d3a2 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Fri, 23 Jan 2026 22:36:02 +0545 Subject: [PATCH 075/187] Add replica groups in dstack-service (#3408) * Add replica groups in dstack-service add_replica_groups_model Replica Groups AutoScaling Rolling deployment and UI Replica Groups implementation clean up * Resolve Merge Conflict & Rename replica_groups to replicas * Resolve pyright type check * Rename replicas to count and make replica names optional * Resolve review comments on probes and rate limits * Resolve tests * Transform to ReplicaGroup in the replica_groups property * Resolve review comments * Resolve test_runs * Resolved major comments * Remove create_group_run_spec and use Job Configurator instead * Resolve Minor Issues * Resolve Minor Issues - Additional fixes * Resolve conflict with master branch * Resolve Major Comments * Resolve some minor comments * Resolve some minor comments --------- Co-authored-by: Bihan Rana --- src/dstack/_internal/cli/utils/run.py | 66 ++++- .../_internal/core/models/configurations.py | 236 ++++++++++++++++-- src/dstack/_internal/core/models/runs.py | 2 + .../server/background/tasks/process_runs.py | 207 +++++++++++++-- ...a7d_add_runmodel_desired_replica_counts.py | 26 ++ src/dstack/_internal/server/models.py | 2 +- .../server/services/jobs/__init__.py | 23 +- .../services/jobs/configurators/base.py | 12 +- .../server/services/jobs/configurators/dev.py | 6 +- .../services/jobs/configurators/service.py | 3 + .../server/services/runs/__init__.py | 85 +++++-- .../_internal/server/services/runs/plan.py | 148 +++++++---- 
.../server/services/runs/replicas.py | 225 ++++++++++++++--- .../_internal/server/services/runs/spec.py | 13 +- .../server/services/services/__init__.py | 28 ++- .../server/services/services/autoscalers.py | 29 +-- .../_internal/server/routers/test_runs.py | 2 + 17 files changed, 929 insertions(+), 184 deletions(-) create mode 100644 src/dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py diff --git a/src/dstack/_internal/cli/utils/run.py b/src/dstack/_internal/cli/utils/run.py index dec354e984..588f74bc69 100644 --- a/src/dstack/_internal/cli/utils/run.py +++ b/src/dstack/_internal/cli/utils/run.py @@ -285,16 +285,38 @@ def _format_job_name( show_deployment_num: bool, show_replica: bool, show_job: bool, + group_index: Optional[int] = None, + last_shown_group_index: Optional[int] = None, ) -> str: name_parts = [] + prefix = "" if show_replica: - name_parts.append(f"replica={job.job_spec.replica_num}") + # Show group information if replica groups are used + if group_index is not None: + # Show group=X replica=Y when group changes, or just replica=Y when same group + if group_index != last_shown_group_index: + # First job in group: use 3 spaces indent + prefix = " " + name_parts.append(f"group={group_index} replica={job.job_spec.replica_num}") + else: + # Subsequent job in same group: align "replica=" with first job's "replica=" + # Calculate padding: width of " group={last_shown_group_index} " + padding_width = 3 + len(f"group={last_shown_group_index}") + 1 + prefix = " " * padding_width + name_parts.append(f"replica={job.job_spec.replica_num}") + else: + # Legacy behavior: no replica groups + prefix = " " + name_parts.append(f"replica={job.job_spec.replica_num}") + else: + prefix = " " + if show_job: name_parts.append(f"job={job.job_spec.job_num}") name_suffix = ( f" deployment={latest_job_submission.deployment_num}" if show_deployment_num else "" ) - name_value = " " + (" ".join(name_parts) if name_parts else "") + 
name_value = prefix + (" ".join(name_parts) if name_parts else "") name_value += name_suffix return name_value @@ -363,6 +385,17 @@ def get_runs_table( ) merge_job_rows = len(run.jobs) == 1 and not show_deployment_num + group_name_to_index: Dict[str, int] = {} + if run.run_spec.configuration.type == "service" and hasattr( + run.run_spec.configuration, "replica_groups" + ): + replica_groups = run.run_spec.configuration.replica_groups + if replica_groups: + for idx, group in enumerate(replica_groups): + assert group.name is not None, "Group name is always set" + group_name = group.name + group_name_to_index[group_name] = idx + run_row: Dict[Union[str, int], Any] = { "NAME": _format_run_name(run, show_deployment_num), "SUBMITTED": format_date(run.submitted_at), @@ -376,13 +409,35 @@ def get_runs_table( if not merge_job_rows: add_row_from_dict(table, run_row) - for job in run.jobs: + # Sort jobs by group index first, then by replica_num within each group + def get_job_sort_key(job: Job) -> tuple: + group_index = None + if group_name_to_index: + group_index = group_name_to_index.get(job.job_spec.replica_group) + # Use a large number for jobs without groups to put them at the end + return (group_index if group_index is not None else 999999, job.job_spec.replica_num) + + sorted_jobs = sorted(run.jobs, key=get_job_sort_key) + + last_shown_group_index: Optional[int] = None + for job in sorted_jobs: latest_job_submission = job.job_submissions[-1] status_formatted = _format_job_submission_status(latest_job_submission, verbose) + # Get group index for this job + group_index: Optional[int] = None + if group_name_to_index: + group_index = group_name_to_index.get(job.job_spec.replica_group) + job_row: Dict[Union[str, int], Any] = { "NAME": _format_job_name( - job, latest_job_submission, show_deployment_num, show_replica, show_job + job, + latest_job_submission, + show_deployment_num, + show_replica, + show_job, + group_index=group_index, + 
last_shown_group_index=last_shown_group_index, ), "STATUS": status_formatted, "PROBES": _format_job_probes( @@ -394,6 +449,9 @@ def get_runs_table( "GPU": "-", "PRICE": "-", } + # Update last shown group index for next iteration + if group_index is not None: + last_shown_group_index = group_index jpd = latest_job_submission.job_provisioning_data if jpd is not None: shared_offer: Optional[InstanceOfferWithAvailability] = None diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 4558aebb11..6bd9f0827e 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -31,6 +31,7 @@ from dstack._internal.core.models.services import AnyModel, OpenAIChatModel from dstack._internal.core.models.unix import UnixUser from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point +from dstack._internal.core.services import is_valid_dstack_resource_name from dstack._internal.utils.common import has_duplicates, list_enum_values_for_annotation from dstack._internal.utils.json_schema import add_extra_schema_types from dstack._internal.utils.json_utils import ( @@ -54,6 +55,7 @@ DEFAULT_PROBE_READY_AFTER = 1 DEFAULT_PROBE_METHOD = "get" MAX_PROBE_URL_LEN = 2048 +DEFAULT_REPLICA_GROUP_NAME = "default" class RunConfigurationType(str, Enum): @@ -612,6 +614,11 @@ class ConfigurationWithCommandsParams(CoreModel): @root_validator def check_image_or_commands_present(cls, values): + # If replicas is list, skip validation - commands come from replica groups + replicas = values.get("replicas") + if isinstance(replicas, list): + return values + if not values.get("commands") and not values.get("image"): raise ValueError("Either `commands` or `image` must be set") return values @@ -734,6 +741,68 @@ def schema_extra(schema: Dict[str, Any]): ) +def _validate_replica_range(v: Range[int]) -> Range[int]: + """Validate a Range[int] used 
for replica counts.""" + if v.max is None: + raise ValueError("The maximum number of replicas is required") + if v.min is None: + v.min = 0 + if v.min < 0: + raise ValueError("The minimum number of replicas must be greater than or equal to 0") + return v + + +class ReplicaGroup(CoreModel): + name: Annotated[ + Optional[str], + Field( + description="The name of the replica group. If not provided, defaults to 'replica-group-0', 'replica-group-1', etc. based on position." + ), + ] + count: Annotated[ + Range[int], + Field( + description="The number of replicas. Can be a number (e.g. `2`) or a range (`0..4` or `1..8`). " + "If it's a range, the `scaling` property is required" + ), + ] + scaling: Annotated[ + Optional[ScalingSpec], + Field(description="The auto-scaling rules. Required if `count` is set to a range"), + ] = None + + resources: Annotated[ + ResourcesSpec, + Field(description="The resources requirements for replicas in this group"), + ] = ResourcesSpec() + + commands: Annotated[ + CommandsList, + Field(description="The shell commands to run for replicas in this group"), + ] = [] + + @validator("name") + def validate_name(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + if not is_valid_dstack_resource_name(v): + raise ValueError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'") + return v + + @validator("count") + def convert_count(cls, v: Range[int]) -> Range[int]: + return _validate_replica_range(v) + + @root_validator() + def validate_scaling(cls, values): + scaling = values.get("scaling") + count = values.get("count") + if count and count.min != count.max and not scaling: + raise ValueError("When you set `count` to a range, ensure to specify `scaling`.") + if count and count.min == count.max and scaling: + raise ValueError("To use `scaling`, `count` must be set to a range.") + return values + + class ServiceConfigurationParams(CoreModel): port: Annotated[ # NOTE: it's a PortMapping for historical reasons. 
Only `port.container_port` is used. @@ -775,13 +844,7 @@ class ServiceConfigurationParams(CoreModel): SERVICE_HTTPS_DEFAULT ) auth: Annotated[bool, Field(description="Enable the authorization")] = True - replicas: Annotated[ - Range[int], - Field( - description="The number of replicas. Can be a number (e.g. `2`) or a range (`0..4` or `1..8`). " - "If it's a range, the `scaling` property is required" - ), - ] = Range[int](min=1, max=1) + scaling: Annotated[ Optional[ScalingSpec], Field(description="The auto-scaling rules. Required if `replicas` is set to a range"), @@ -792,6 +855,20 @@ class ServiceConfigurationParams(CoreModel): Field(description="List of probes used to determine job health"), ] = [] + replicas: Annotated[ + Optional[Union[List[ReplicaGroup], Range[int]]], + Field( + description=( + "The number of replicas or a list of replica groups. " + "Can be an integer (e.g., `2`), a range (e.g., `0..4`), or a list of replica groups. " + "Each replica group defines replicas with shared configuration " + "(commands, resources, scaling). " + "When `replicas` is a list of replica groups, top-level `scaling`, `commands`, " + "and `resources` are not allowed and must be specified in each replica group instead. 
" + ) + ), + ] = None + @validator("port") def convert_port(cls, v) -> PortMapping: if isinstance(v, int): @@ -806,26 +883,6 @@ def convert_model(cls, v: Optional[Union[AnyModel, str]]) -> Optional[AnyModel]: return OpenAIChatModel(type="chat", name=v, format="openai") return v - @validator("replicas") - def convert_replicas(cls, v: Range[int]) -> Range[int]: - if v.max is None: - raise ValueError("The maximum number of replicas is required") - if v.min is None: - v.min = 0 - if v.min < 0: - raise ValueError("The minimum number of replicas must be greater than or equal to 0") - return v - - @root_validator() - def validate_scaling(cls, values): - scaling = values.get("scaling") - replicas = values.get("replicas") - if replicas and replicas.min != replicas.max and not scaling: - raise ValueError("When you set `replicas` to a range, ensure to specify `scaling`.") - if replicas and replicas.min == replicas.max and scaling: - raise ValueError("To use `scaling`, `replicas` must be set to a range.") - return values - @validator("rate_limits") def validate_rate_limits(cls, v: list[RateLimit]) -> list[RateLimit]: counts = Counter(limit.prefix for limit in v) @@ -847,6 +904,103 @@ def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]: raise ValueError("Probes must be unique") return v + @validator("replicas") + def validate_replicas( + cls, v: Optional[Union[Range[int], List[ReplicaGroup]]] + ) -> Optional[Union[Range[int], List[ReplicaGroup]]]: + if v is None: + return v + if isinstance(v, Range): + return _validate_replica_range(v) + + if isinstance(v, list): + if not v: + raise ValueError("`replicas` cannot be an empty list") + + # Assign default names to groups without names + for index, group in enumerate(v): + if group.name is None: + group.name = f"replica-group-{index}" + + # Check for duplicate names + names = [group.name for group in v] + if len(names) != len(set(names)): + duplicates = [name for name in set(names) if names.count(name) > 1] + raise 
ValueError( + f"Duplicate replica group names found: {duplicates}. " + "Each replica group must have a unique name." + ) + return v + + @root_validator() + def validate_scaling(cls, values): + scaling = values.get("scaling") + replicas = values.get("replicas") + + if isinstance(replicas, Range): + if replicas and replicas.min != replicas.max and not scaling: + raise ValueError( + "When you set `replicas` to a range, ensure to specify `scaling`." + ) + if replicas and replicas.min == replicas.max and scaling: + raise ValueError("To use `scaling`, `replicas` must be set to a range.") + return values + + @root_validator() + def validate_top_level_properties_with_replica_groups(cls, values): + """ + When replicas is a list of ReplicaGroup, forbid top-level scaling, commands, and resources + """ + replicas = values.get("replicas") + + if not isinstance(replicas, list): + return values + + scaling = values.get("scaling") + if scaling is not None: + raise ValueError( + "Top-level `scaling` is not allowed when `replicas` is a list. " + "Specify `scaling` in each replica group instead." + ) + + commands = values.get("commands", []) + if commands: + raise ValueError( + "Top-level `commands` is not allowed when `replicas` is a list. " + "Specify `commands` in each replica group instead." + ) + + resources = values.get("resources") + + default_resources = ResourcesSpec() + if resources and resources.dict() != default_resources.dict(): + raise ValueError( + "Top-level `resources` is not allowed when `replicas` is a list. " + "Specify `resources` in each replica group instead." + ) + + return values + + @root_validator() + def validate_replica_groups_have_commands_or_image(cls, values): + """ + When replicas is a list, ensure each ReplicaGroup has commands OR service has image. 
+ """ + replicas = values.get("replicas") + image = values.get("image") + + if not isinstance(replicas, list): + return values + + for group in replicas: + if not group.commands and not image: + raise ValueError( + f"Replica group '{group.name}' has no commands. " + "Either set `commands` in the replica group or set `image` at the service level." + ) + + return values + class ServiceConfigurationConfig( ProfileParamsConfig, @@ -869,6 +1023,34 @@ class ServiceConfiguration( ): type: Literal["service"] = "service" + @property + def replica_groups(self) -> List[ReplicaGroup]: + if self.replicas is None: + return [ + ReplicaGroup( + name=DEFAULT_REPLICA_GROUP_NAME, + count=Range[int](min=1, max=1), + commands=self.commands, + resources=self.resources, + scaling=self.scaling, + ) + ] + if isinstance(self.replicas, list): + return self.replicas + if isinstance(self.replicas, Range): + return [ + ReplicaGroup( + name=DEFAULT_REPLICA_GROUP_NAME, + count=self.replicas, + commands=self.commands, + resources=self.resources, + scaling=self.scaling, + ) + ] + raise ValueError( + f"Invalid replicas type: {type(self.replicas)}. 
Expected None, Range[int], or List[ReplicaGroup]" + ) + AnyRunConfiguration = Union[DevEnvironmentConfiguration, TaskConfiguration, ServiceConfiguration] diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index a966bc34a0..27c6f430c1 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -17,6 +17,7 @@ ) from dstack._internal.core.models.configurations import ( DEFAULT_PROBE_METHOD, + DEFAULT_REPLICA_GROUP_NAME, LEGACY_REPO_DIR, AnyRunConfiguration, HTTPHeaderSpec, @@ -253,6 +254,7 @@ class JobSpec(CoreModel): job_num: int job_name: str jobs_per_replica: int = 1 # default value for backward compatibility + replica_group: str = DEFAULT_REPLICA_GROUP_NAME app_specs: Optional[List[AppSpec]] user: Optional[UnixUser] = None # default value for backward compatibility commands: List[str] diff --git a/src/dstack/_internal/server/background/tasks/process_runs.py b/src/dstack/_internal/server/background/tasks/process_runs.py index ad42e7ed40..ee907e519b 100644 --- a/src/dstack/_internal/server/background/tasks/process_runs.py +++ b/src/dstack/_internal/server/background/tasks/process_runs.py @@ -1,5 +1,6 @@ import asyncio import datetime +import json from typing import List, Optional, Set, Tuple from sqlalchemy import and_, func, or_, select @@ -8,6 +9,7 @@ import dstack._internal.server.services.services.autoscalers as autoscalers from dstack._internal.core.errors import ServerError +from dstack._internal.core.models.configurations import ReplicaGroup from dstack._internal.core.models.profiles import RetryEvent, StopCriteria from dstack._internal.core.models.runs import ( Job, @@ -45,9 +47,14 @@ switch_run_status, ) from dstack._internal.server.services.runs.replicas import ( + build_replica_lists, + has_out_of_date_replicas, is_replica_registered, + job_belongs_to_group, retry_run_replica_jobs, + scale_down_replicas, scale_run_replicas, + scale_run_replicas_per_group, ) from 
dstack._internal.server.services.secrets import get_project_secrets_mapping from dstack._internal.server.services.services import update_service_desired_replica_count @@ -234,9 +241,10 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel): logger.debug("%s: retrying run is not yet ready for resubmission", fmt(run_model)) return - run_model.desired_replica_count = 1 if run.run_spec.configuration.type == "service": - run_model.desired_replica_count = run.run_spec.configuration.replicas.min or 0 + run_model.desired_replica_count = sum( + group.count.min or 0 for group in run.run_spec.configuration.replica_groups + ) await update_service_desired_replica_count( session, run_model, @@ -245,12 +253,18 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel): last_scaled_at=None, ) - if run_model.desired_replica_count == 0: - # stay zero scaled - return + if run_model.desired_replica_count == 0: + # stay zero scaled + return + + replicas: List[ReplicaGroup] = run.run_spec.configuration.replica_groups + + await scale_run_replicas_per_group(session, run_model, replicas) + else: + run_model.desired_replica_count = 1 + await scale_run_replicas(session, run_model, replicas_diff=run_model.desired_replica_count) - await scale_run_replicas(session, run_model, replicas_diff=run_model.desired_replica_count) - switch_run_status(session, run_model, RunStatus.SUBMITTED) + switch_run_status(session=session, run_model=run_model, new_status=RunStatus.SUBMITTED) def _retrying_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool: @@ -486,9 +500,60 @@ async def _handle_run_replicas( # FIXME: should only include scaling events, not retries and deployments last_scaled_at=max((r.timestamp for r in replicas_info), default=None), ) + replicas: List[ReplicaGroup] = run_spec.configuration.replica_groups + assert replicas, "replica groups should always return at least one group" + + await scale_run_replicas_per_group(session, run_model, 
replicas) + + # Handle per-group rolling deployment + await _update_jobs_to_new_deployment_in_place( + session=session, + run_model=run_model, + run_spec=run_spec, + replicas=replicas, + ) + # Process per-group rolling deployment + for group in replicas: + await _handle_rolling_deployment_for_group( + session=session, run_model=run_model, group=group, run_spec=run_spec + ) + # Terminate replicas from groups that were removed from the configuration + existing_group_names = set() + for job in run_model.jobs: + if job.status.is_finished(): + continue + try: + job_spec = JobSpec.__response__.parse_raw(job.job_spec_data) + existing_group_names.add(job_spec.replica_group) + except Exception: + continue + new_group_names = {group.name for group in replicas} + removed_group_names = existing_group_names - new_group_names + for removed_group_name in removed_group_names: + # Build replica lists for this removed group + active_replicas, inactive_replicas = build_replica_lists( + run_model=run_model, + group_filter=removed_group_name, + ) + + total_replicas = len(active_replicas) + len(inactive_replicas) + if total_replicas > 0: + logger.info( + "%s: terminating %d replica(s) from removed group '%s'", + fmt(run_model), + total_replicas, + removed_group_name, + ) + # Terminate all active replicas in the removed group + if active_replicas: + scale_down_replicas(session, active_replicas, len(active_replicas)) + # Terminate all inactive replicas in the removed group + if inactive_replicas: + scale_down_replicas(session, inactive_replicas, len(inactive_replicas)) + return max_replica_count = run_model.desired_replica_count - if _has_out_of_date_replicas(run_model): + if has_out_of_date_replicas(run_model): # allow extra replicas when deployment is in progress max_replica_count += ROLLING_DEPLOYMENT_MAX_SURGE @@ -506,7 +571,7 @@ async def _handle_run_replicas( run_model=run_model, run_spec=run_spec, ) - if _has_out_of_date_replicas(run_model): + if 
has_out_of_date_replicas(run_model): assert run_spec.configuration.type == "service", ( "Rolling deployment is only supported for services" ) @@ -551,7 +616,10 @@ async def _handle_run_replicas( async def _update_jobs_to_new_deployment_in_place( - session: AsyncSession, run_model: RunModel, run_spec: RunSpec + session: AsyncSession, + run_model: RunModel, + run_spec: RunSpec, + replicas: Optional[List] = None, ) -> None: """ Bump deployment_num for jobs that do not require redeployment. @@ -560,16 +628,26 @@ async def _update_jobs_to_new_deployment_in_place( session=session, project=run_model.project, ) + for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs): if all(j.status.is_finished() for j in job_models): continue if all(j.deployment_num == run_model.deployment_num for j in job_models): continue + + # Determine which group this replica belongs to + replica_group_name = None + + if replicas: + job_spec = JobSpec.__response__.parse_raw(job_models[0].job_spec_data) + replica_group_name = job_spec.replica_group + # FIXME: Handle getting image configuration errors or skip it. 
new_job_specs = await get_job_specs_from_run_spec( run_spec=run_spec, secrets=secrets, replica_num=replica_num, + replica_group_name=replica_group_name, ) assert len(new_job_specs) == len(job_models), ( "Changing the number of jobs within a replica is not yet supported" @@ -585,15 +663,6 @@ async def _update_jobs_to_new_deployment_in_place( job_model.deployment_num = run_model.deployment_num -def _has_out_of_date_replicas(run: RunModel) -> bool: - for job in run.jobs: - if job.deployment_num < run.deployment_num and not ( - job.status.is_finished() or job.termination_reason == JobTerminationReason.SCALED_DOWN - ): - return True - return False - - async def _should_retry_job( session: AsyncSession, run: Run, @@ -671,3 +740,103 @@ def _should_stop_on_master_done(run: Run) -> bool: if is_master_job(job) and job.job_submissions[-1].status == JobStatus.DONE: return True return False + + +async def _handle_rolling_deployment_for_group( + session: AsyncSession, run_model: RunModel, group: ReplicaGroup, run_spec: RunSpec +) -> None: + """ + Handle rolling deployment for a single replica group. 
+ """ + from dstack._internal.server.services.runs.replicas import ( + build_replica_lists, + scale_run_replicas_for_group, + ) + + desired_replica_counts = ( + json.loads(run_model.desired_replica_counts) if run_model.desired_replica_counts else {} + ) + + group_desired = desired_replica_counts.get(group.name, group.count.min or 0) + + # Check if group has out-of-date replicas + if not has_out_of_date_replicas(run_model, group_filter=group.name): + return # Group is up-to-date + + # Calculate max replicas (allow surge during deployment) + group_max_replica_count = group_desired + ROLLING_DEPLOYMENT_MAX_SURGE + + # Count non-terminated replicas for this group only + + non_terminated_replica_count = len( + { + j.replica_num + for j in run_model.jobs + if not j.status.is_finished() + and group.name is not None + and job_belongs_to_group(job=j, group_name=group.name) + } + ) + + # Start new up-to-date replicas if needed + if non_terminated_replica_count < group_max_replica_count: + active_replicas, inactive_replicas = build_replica_lists( + run_model=run_model, + group_filter=group.name, + ) + + await scale_run_replicas_for_group( + session=session, + run_model=run_model, + group=group, + replicas_diff=group_max_replica_count - non_terminated_replica_count, + run_spec=run_spec, + active_replicas=active_replicas, + inactive_replicas=inactive_replicas, + ) + + # Stop out-of-date replicas that are not registered + replicas_to_stop_count = 0 + for _, jobs in group_jobs_by_replica_latest(run_model.jobs): + assert group.name is not None, "Group name is always set" + if not job_belongs_to_group(jobs[0], group.name): + continue + # Check if replica is out-of-date and not registered + if ( + any(j.deployment_num < run_model.deployment_num for j in jobs) + and any( + j.status not in [JobStatus.TERMINATING] + JobStatus.finished_statuses() + for j in jobs + ) + and not is_replica_registered(jobs) + ): + replicas_to_stop_count += 1 + + # Stop excessive registered out-of-date 
replicas + non_terminating_registered_replicas_count = 0 + for _, jobs in group_jobs_by_replica_latest(run_model.jobs): + assert group.name is not None, "Group name is always set" + if not job_belongs_to_group(jobs[0], group.name): + continue + + if is_replica_registered(jobs) and all(j.status != JobStatus.TERMINATING for j in jobs): + non_terminating_registered_replicas_count += 1 + + replicas_to_stop_count += max(0, non_terminating_registered_replicas_count - group_desired) + + if replicas_to_stop_count > 0: + # Build lists again to get current state + active_replicas, inactive_replicas = build_replica_lists( + run_model=run_model, + group_filter=group.name, + ) + + await scale_run_replicas_for_group( + session=session, + run_model=run_model, + group=group, + replicas_diff=-replicas_to_stop_count, + run_spec=run_spec, + active_replicas=active_replicas, + inactive_replicas=inactive_replicas, + ) diff --git a/src/dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py b/src/dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py new file mode 100644 index 0000000000..e993df7bec --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py @@ -0,0 +1,26 @@ +"""add runmodel desired_replica_counts + +Revision ID: 706e0acc3a7d +Revises: 903c91e24634 +Create Date: 2025-12-18 10:54:13.508297 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "706e0acc3a7d" +down_revision = "903c91e24634" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.add_column(sa.Column("desired_replica_counts", sa.Text(), nullable=True)) + + +def downgrade() -> None: + with op.batch_alter_table("runs", schema=None) as batch_op: + batch_op.drop_column("desired_replica_counts") diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 6a8aa41eb4..982ade215f 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -405,7 +405,7 @@ class RunModel(BaseModel): priority: Mapped[int] = mapped_column(Integer, default=0) deployment_num: Mapped[int] = mapped_column(Integer) desired_replica_count: Mapped[int] = mapped_column(Integer) - + desired_replica_counts: Mapped[Optional[str]] = mapped_column(Text, nullable=True) jobs: Mapped[List["JobModel"]] = relationship( back_populates="run", lazy="selectin", order_by="[JobModel.replica_num, JobModel.job_num]" ) diff --git a/src/dstack/_internal/server/services/jobs/__init__.py b/src/dstack/_internal/server/services/jobs/__init__.py index 18d410c133..2ddadbfb1e 100644 --- a/src/dstack/_internal/server/services/jobs/__init__.py +++ b/src/dstack/_internal/server/services/jobs/__init__.py @@ -99,7 +99,10 @@ def switch_job_status( async def get_jobs_from_run_spec( - run_spec: RunSpec, secrets: Dict[str, str], replica_num: int + run_spec: RunSpec, + secrets: Dict[str, str], + replica_num: int, + replica_group_name: Optional[str] = None, ) -> List[Job]: return [ Job(job_spec=s, job_submissions=[]) @@ -107,14 +110,20 @@ async def get_jobs_from_run_spec( run_spec=run_spec, secrets=secrets, replica_num=replica_num, + replica_group_name=replica_group_name, ) ] async def get_job_specs_from_run_spec( - run_spec: RunSpec, secrets: Dict[str, str], replica_num: int + run_spec: RunSpec, + secrets: Dict[str, str], + replica_num: int, 
+ replica_group_name: Optional[str] = None, ) -> List[JobSpec]: - job_configurator = _get_job_configurator(run_spec=run_spec, secrets=secrets) + job_configurator = _get_job_configurator( + run_spec=run_spec, secrets=secrets, replica_group_name=replica_group_name + ) job_specs = await job_configurator.get_job_specs(replica_num=replica_num) return job_specs @@ -242,10 +251,14 @@ def is_master_job(job: Job) -> bool: return job.job_spec.job_num == 0 -def _get_job_configurator(run_spec: RunSpec, secrets: Dict[str, str]) -> JobConfigurator: +def _get_job_configurator( + run_spec: RunSpec, secrets: Dict[str, str], replica_group_name: Optional[str] = None +) -> JobConfigurator: configuration_type = RunConfigurationType(run_spec.configuration.type) configurator_class = _configuration_type_to_configurator_class_map[configuration_type] - return configurator_class(run_spec=run_spec, secrets=secrets) + return configurator_class( + run_spec=run_spec, secrets=secrets, replica_group_name=replica_group_name + ) _job_configurator_classes = [ diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index 72821f1dd3..df6738a774 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -16,6 +16,7 @@ DEFAULT_PROBE_READY_AFTER, DEFAULT_PROBE_TIMEOUT, DEFAULT_PROBE_URL, + DEFAULT_REPLICA_GROUP_NAME, LEGACY_REPO_DIR, PortMapping, ProbeConfig, @@ -90,9 +91,11 @@ def __init__( self, run_spec: RunSpec, secrets: Optional[Dict[str, str]] = None, + replica_group_name: Optional[str] = None, ): self.run_spec = run_spec self.secrets = secrets or {} + self.replica_group_name = replica_group_name async def get_job_specs(self, replica_num: int) -> List[JobSpec]: job_spec = await self._get_job_spec(replica_num=replica_num, job_num=0, jobs_per_replica=1) @@ -150,6 +153,7 @@ async def _get_job_spec( job_num=job_num, 
job_name=f"{self.run_spec.run_name}-{job_num}-{replica_num}", jobs_per_replica=jobs_per_replica, + replica_group=self.replica_group_name or DEFAULT_REPLICA_GROUP_NAME, app_specs=self._app_specs(), commands=await self._commands(), env=self._env(), @@ -308,9 +312,15 @@ def _registry_auth(self) -> Optional[RegistryAuth]: return self.run_spec.configuration.registry_auth def _requirements(self, jobs_per_replica: int) -> Requirements: + resources = self.run_spec.configuration.resources + if self.run_spec.configuration.type == "service": + for group in self.run_spec.configuration.replica_groups: + if group.name == self.replica_group_name: + resources = group.resources + break spot_policy = self._spot_policy() return Requirements( - resources=self.run_spec.configuration.resources, + resources=resources, max_price=self.run_spec.merged_profile.max_price, spot=None if spot_policy == SpotPolicy.AUTO else (spot_policy == SpotPolicy.SPOT), reservation=self.run_spec.merged_profile.reservation, diff --git a/src/dstack/_internal/server/services/jobs/configurators/dev.py b/src/dstack/_internal/server/services/jobs/configurators/dev.py index da683a60cc..42bdc4376e 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/dev.py +++ b/src/dstack/_internal/server/services/jobs/configurators/dev.py @@ -18,7 +18,9 @@ class DevEnvironmentJobConfigurator(JobConfigurator): TYPE: RunConfigurationType = RunConfigurationType.DEV_ENVIRONMENT - def __init__(self, run_spec: RunSpec, secrets: Dict[str, str]): + def __init__( + self, run_spec: RunSpec, secrets: Dict[str, str], replica_group_name: Optional[str] = None + ): assert run_spec.configuration.type == "dev-environment" if run_spec.configuration.ide == "vscode": @@ -34,7 +36,7 @@ def __init__(self, run_spec: RunSpec, secrets: Dict[str, str]): version=run_spec.configuration.version, extensions=["ms-python.python", "ms-toolsai.jupyter"], ) - super().__init__(run_spec=run_spec, secrets=secrets) + super().__init__(run_spec=run_spec, 
secrets=secrets, replica_group_name=replica_group_name) def _shell_commands(self) -> List[str]: assert self.run_spec.configuration.type == "dev-environment" diff --git a/src/dstack/_internal/server/services/jobs/configurators/service.py b/src/dstack/_internal/server/services/jobs/configurators/service.py index be15c4b23c..6b5aa8c2d3 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/service.py +++ b/src/dstack/_internal/server/services/jobs/configurators/service.py @@ -10,6 +10,9 @@ class ServiceJobConfigurator(JobConfigurator): def _shell_commands(self) -> List[str]: assert self.run_spec.configuration.type == "service" + for group in self.run_spec.configuration.replica_groups: + if group.name == self.replica_group_name: + return group.commands return self.run_spec.configuration.commands def _default_single_branch(self) -> bool: diff --git a/src/dstack/_internal/server/services/runs/__init__.py b/src/dstack/_internal/server/services/runs/__init__.py index 5773403cff..f99934dc3f 100644 --- a/src/dstack/_internal/server/services/runs/__init__.py +++ b/src/dstack/_internal/server/services/runs/__init__.py @@ -490,8 +490,6 @@ async def submit_run( if run_spec.merged_profile.schedule is not None: initial_status = RunStatus.PENDING initial_replicas = 0 - elif run_spec.configuration.type == "service": - initial_replicas = run_spec.configuration.replicas.min or 0 run_model = RunModel( id=uuid.uuid4(), @@ -519,32 +517,67 @@ async def submit_run( if run_spec.configuration.type == "service": await services.register_service(session, run_model, run_spec) + service_config = run_spec.configuration - for replica_num in range(initial_replicas): - jobs = await get_jobs_from_run_spec( - run_spec=run_spec, - secrets=secrets, - replica_num=replica_num, - ) - for job in jobs: - job_model = create_job_model_for_new_submission( - run_model=run_model, - job=job, - status=JobStatus.SUBMITTED, - ) - session.add(job_model) - events.emit( - session, - f"Job created on run 
submission. Status: {job_model.status.upper()}", - # Set `SystemActor` for consistency with all other places where jobs can be - # created (retry, scaling, rolling deployments, etc). Think of the run as being - # created by the user, while the job is created by the system to satisfy the - # run spec. - actor=events.SystemActor(), - targets=[ - events.Target.from_model(job_model), - ], + global_replica_num = 0 # Global counter across all groups for unique replica_num + + for replica_group in service_config.replica_groups: + if run_spec.merged_profile.schedule is not None: + group_initial_replicas = 0 + else: + group_initial_replicas = replica_group.count.min or 0 + + # Each replica in this group gets the same group-specific configuration + for group_replica_num in range(group_initial_replicas): + jobs = await get_jobs_from_run_spec( + run_spec=run_spec, + secrets=secrets, + replica_num=global_replica_num, + replica_group_name=replica_group.name, + ) + + for job in jobs: + job_model = create_job_model_for_new_submission( + run_model=run_model, + job=job, + status=JobStatus.SUBMITTED, + ) + session.add(job_model) + events.emit( + session, + f"Job created on run submission. Status: {job_model.status.upper()}", + actor=events.SystemActor(), + targets=[ + events.Target.from_model(job_model), + ], + ) + global_replica_num += 1 + else: + for replica_num in range(initial_replicas): + jobs = await get_jobs_from_run_spec( + run_spec=run_spec, + secrets=secrets, + replica_num=replica_num, ) + for job in jobs: + job_model = create_job_model_for_new_submission( + run_model=run_model, + job=job, + status=JobStatus.SUBMITTED, + ) + session.add(job_model) + events.emit( + session, + f"Job created on run submission. Status: {job_model.status.upper()}", + # Set `SystemActor` for consistency with all other places where jobs can be + # created (retry, scaling, rolling deployments, etc). 
Think of the run as being + # created by the user, while the job is created by the system to satisfy the + # run spec. + actor=events.SystemActor(), + targets=[ + events.Target.from_model(job_model), + ], + ) await session.commit() await session.refresh(run_model) diff --git a/src/dstack/_internal/server/services/runs/plan.py b/src/dstack/_internal/server/services/runs/plan.py index dd1ad1b284..5e3b6e5a02 100644 --- a/src/dstack/_internal/server/services/runs/plan.py +++ b/src/dstack/_internal/server/services/runs/plan.py @@ -78,57 +78,117 @@ async def get_job_plans( run_spec.run_name = "dry-run" secrets = await get_project_secrets_mapping(session=session, project=project) - jobs = await get_jobs_from_run_spec( - run_spec=run_spec, - secrets=secrets, - replica_num=0, - ) - volumes = await get_job_configured_volumes( - session=session, - project=project, - run_spec=run_spec, - job_num=0, - ) - candidate_fleet_models = await _select_candidate_fleet_models( - session=session, - project=project, - run_model=None, - run_spec=run_spec, - ) - fleet_model, instance_offers, backend_offers = await find_optimal_fleet_with_offers( - project=project, - fleet_models=candidate_fleet_models, - run_model=None, - run_spec=run_spec, - job=jobs[0], - master_job_provisioning_data=None, - volumes=volumes, - exclude_not_available=False, - ) - if _should_force_non_fleet_offers(run_spec) or ( - FeatureFlags.AUTOCREATED_FLEETS_ENABLED and profile.fleets is None and fleet_model is None - ): - # Keep the old behavior returning all offers irrespective of fleets. - # Needed for supporting offers with autocreated fleets flow (and for `dstack offer`). 
- instance_offers, backend_offers = await _get_non_fleet_offers( + + job_plans = [] + + if run_spec.configuration.type == "service": + volumes = await get_job_configured_volumes( session=session, project=project, - profile=profile, + run_spec=run_spec, + job_num=0, + ) + candidate_fleet_models = await _select_candidate_fleet_models( + session=session, + project=project, + run_model=None, + run_spec=run_spec, + ) + for replica_group in run_spec.configuration.replica_groups: + jobs = await get_jobs_from_run_spec( + run_spec=run_spec, + secrets=secrets, + replica_num=0, + replica_group_name=replica_group.name, + ) + fleet_model, instance_offers, backend_offers = await find_optimal_fleet_with_offers( + project=project, + fleet_models=candidate_fleet_models, + run_model=None, + run_spec=run_spec, + job=jobs[0], + master_job_provisioning_data=None, + volumes=volumes, + exclude_not_available=False, + ) + if _should_force_non_fleet_offers(run_spec) or ( + FeatureFlags.AUTOCREATED_FLEETS_ENABLED + and profile.fleets is None + and fleet_model is None + ): + # Keep the old behavior returning all offers irrespective of fleets. + # Needed for supporting offers with autocreated fleets flow (and for `dstack offer`). 
+ instance_offers, backend_offers = await _get_non_fleet_offers( + session=session, + project=project, + profile=profile, + run_spec=run_spec, + job=jobs[0], + volumes=volumes, + ) + + for job in jobs: + job_plan = _get_job_plan( + instance_offers=instance_offers, + backend_offers=backend_offers, + profile=profile, + job=job, + max_offers=max_offers, + ) + job_plans.append(job_plan) + else: + jobs = await get_jobs_from_run_spec( + run_spec=run_spec, + secrets=secrets, + replica_num=0, + ) + volumes = await get_job_configured_volumes( + session=session, + project=project, + run_spec=run_spec, + job_num=0, + ) + candidate_fleet_models = await _select_candidate_fleet_models( + session=session, + project=project, + run_model=None, + run_spec=run_spec, + ) + fleet_model, instance_offers, backend_offers = await find_optimal_fleet_with_offers( + project=project, + fleet_models=candidate_fleet_models, + run_model=None, run_spec=run_spec, job=jobs[0], + master_job_provisioning_data=None, volumes=volumes, + exclude_not_available=False, ) + if _should_force_non_fleet_offers(run_spec) or ( + FeatureFlags.AUTOCREATED_FLEETS_ENABLED + and profile.fleets is None + and fleet_model is None + ): + # Keep the old behavior returning all offers irrespective of fleets. + # Needed for supporting offers with autocreated fleets flow (and for `dstack offer`). 
+ instance_offers, backend_offers = await _get_non_fleet_offers( + session=session, + project=project, + profile=profile, + run_spec=run_spec, + job=jobs[0], + volumes=volumes, + ) - job_plans = [] - for job in jobs: - job_plan = _get_job_plan( - instance_offers=instance_offers, - backend_offers=backend_offers, - profile=profile, - job=job, - max_offers=max_offers, - ) - job_plans.append(job_plan) + for job in jobs: + job_plan = _get_job_plan( + instance_offers=instance_offers, + backend_offers=backend_offers, + profile=profile, + job=job, + max_offers=max_offers, + ) + job_plans.append(job_plan) run_spec.run_name = run_name return job_plans diff --git a/src/dstack/_internal/server/services/runs/replicas.py b/src/dstack/_internal/server/services/runs/replicas.py index e994e77ddc..4f6c7ee19d 100644 --- a/src/dstack/_internal/server/services/runs/replicas.py +++ b/src/dstack/_internal/server/services/runs/replicas.py @@ -1,8 +1,10 @@ -from typing import List +import json +from typing import List, Optional, Tuple from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.models.runs import JobStatus, JobTerminationReason, RunSpec +from dstack._internal.core.models.configurations import ReplicaGroup +from dstack._internal.core.models.runs import JobSpec, JobStatus, JobTerminationReason, RunSpec from dstack._internal.server.models import JobModel, RunModel from dstack._internal.server.services import events from dstack._internal.server.services.jobs import ( @@ -11,7 +13,10 @@ switch_job_status, ) from dstack._internal.server.services.logging import fmt -from dstack._internal.server.services.runs import create_job_model_for_new_submission, logger +from dstack._internal.server.services.runs import ( + create_job_model_for_new_submission, + logger, +) from dstack._internal.server.services.secrets import get_project_secrets_mapping @@ -23,10 +28,17 @@ async def retry_run_replica_jobs( session=session, project=run_model.project, ) + + # Determine replica 
group from existing job + run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) + job_spec = JobSpec.__response__.parse_raw(latest_jobs[0].job_spec_data) + replica_group_name = job_spec.replica_group + new_jobs = await get_jobs_from_run_spec( - run_spec=RunSpec.__response__.parse_raw(run_model.run_spec), + run_spec=run_spec, secrets=secrets, replica_num=latest_jobs[0].replica_num, + replica_group_name=replica_group_name, ) assert len(new_jobs) == len(latest_jobs), ( "Changing the number of jobs within a replica is not yet supported" @@ -64,7 +76,6 @@ def is_replica_registered(jobs: list[JobModel]) -> bool: async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replicas_diff: int): if replicas_diff == 0: - # nothing to do return logger.info( @@ -74,14 +85,42 @@ async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replica abs(replicas_diff), ) + active_replicas, inactive_replicas = build_replica_lists(run_model) + run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) + + if replicas_diff < 0: + scale_down_replicas(session, active_replicas, abs(replicas_diff)) + else: + await _scale_up_replicas( + session, + run_model, + inactive_replicas, + replicas_diff, + run_spec, + group_name=None, + ) + + +def build_replica_lists( + run_model: RunModel, + group_filter: Optional[str] = None, +) -> Tuple[ + List[Tuple[int, bool, int, List[JobModel]]], List[Tuple[int, bool, int, List[JobModel]]] +]: # lists of (importance, is_out_of_date, replica_num, jobs) active_replicas: list[tuple[int, bool, int, list[JobModel]]] = [] inactive_replicas: list[tuple[int, bool, int, list[JobModel]]] = [] for replica_num, replica_jobs in group_jobs_by_replica_latest(run_model.jobs): + # Filter by group if specified + if group_filter is not None: + if not job_belongs_to_group(replica_jobs[0], group_filter): + continue + statuses = set(job.status for job in replica_jobs) deployment_num = replica_jobs[0].deployment_num # same for all jobs 
is_out_of_date = deployment_num < run_model.deployment_num + if {JobStatus.TERMINATING, *JobStatus.finished_statuses()} & statuses: # if there are any terminating or finished jobs, the replica is inactive inactive_replicas.append((0, is_out_of_date, replica_num, replica_jobs)) @@ -98,42 +137,68 @@ async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replica # all jobs are running and ready, the replica is active and has the importance of 3 active_replicas.append((3, is_out_of_date, replica_num, replica_jobs)) - # sort by is_out_of_date (up-to-date first), importance (desc), and replica_num (asc) + # Sort by is_out_of_date (up-to-date first), importance (desc), and replica_num (asc) active_replicas.sort(key=lambda r: (r[1], -r[0], r[2])) - run_spec = RunSpec.__response__.parse_raw(run_model.run_spec) - if replicas_diff < 0: - for _, _, _, replica_jobs in reversed(active_replicas[-abs(replicas_diff) :]): - # scale down the less important replicas first - for job in replica_jobs: - if job.status.is_finished() or job.status == JobStatus.TERMINATING: - continue - job.termination_reason = JobTerminationReason.SCALED_DOWN - switch_job_status(session, job, JobStatus.TERMINATING, events.SystemActor()) - # background task will process the job later - else: - scheduled_replicas = 0 + return active_replicas, inactive_replicas + + +def scale_down_replicas( + session: AsyncSession, + active_replicas: List[Tuple[int, bool, int, List[JobModel]]], + count: int, +) -> None: + """Scale down by terminating the least important replicas""" + if count <= 0: + return + + for _, _, _, replica_jobs in reversed(active_replicas[-count:]): + for job in replica_jobs: + if job.status.is_finished() or job.status == JobStatus.TERMINATING: + continue + job.termination_reason = JobTerminationReason.SCALED_DOWN + switch_job_status(session, job, JobStatus.TERMINATING, events.SystemActor()) + # background task will process the job later + + +async def _scale_up_replicas( + session: 
AsyncSession, + run_model: RunModel, + inactive_replicas: List[Tuple[int, bool, int, List[JobModel]]], + replicas_diff: int, + run_spec: RunSpec, + group_name: Optional[str] = None, +) -> None: + """Scale up by retrying inactive replicas and creating new ones""" + if replicas_diff <= 0: + return + + scheduled_replicas = 0 - # rerun inactive replicas - for _, _, _, replica_jobs in inactive_replicas: - if scheduled_replicas == replicas_diff: - break - await retry_run_replica_jobs(session, run_model, replica_jobs, only_failed=False) - scheduled_replicas += 1 + # Retry inactive replicas first + for _, _, _, replica_jobs in inactive_replicas: + if scheduled_replicas == replicas_diff: + break + await retry_run_replica_jobs(session, run_model, replica_jobs, only_failed=False) + scheduled_replicas += 1 + # Create new replicas + if scheduled_replicas < replicas_diff: secrets = await get_project_secrets_mapping( session=session, project=run_model.project, ) - for replica_num in range( - len(active_replicas) + scheduled_replicas, len(active_replicas) + replicas_diff - ): - # FIXME: Handle getting image configuration errors or skip it. + max_replica_num = max((job.replica_num for job in run_model.jobs), default=-1) + + new_replicas_needed = replicas_diff - scheduled_replicas + for i in range(new_replicas_needed): + new_replica_num = max_replica_num + 1 + i jobs = await get_jobs_from_run_spec( run_spec=run_spec, secrets=secrets, - replica_num=replica_num, + replica_num=new_replica_num, + replica_group_name=group_name, ) for job in jobs: job_model = create_job_model_for_new_submission( @@ -148,3 +213,105 @@ async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replica actor=events.SystemActor(), targets=[events.Target.from_model(job_model)], ) + # Append to run_model.jobs so that when processing later replica groups in the same + # transaction, run_model.jobs includes jobs from previously processed groups. 
+ run_model.jobs.append(job_model) + + +async def scale_run_replicas_per_group( + session: AsyncSession, + run_model: RunModel, + replicas: List[ReplicaGroup], +) -> None: + """Scale each replica group independently""" + if not replicas: + return + + desired_replica_counts = ( + json.loads(run_model.desired_replica_counts) if run_model.desired_replica_counts else {} + ) + + for group in replicas: + assert group.name is not None, "Group name is always set" + group_desired = desired_replica_counts.get(group.name, group.count.min or 0) + + # Build replica lists filtered by this group + active_replicas, inactive_replicas = build_replica_lists( + run_model=run_model, group_filter=group.name + ) + + # Count active replicas + active_group_count = len(active_replicas) + group_diff = group_desired - active_group_count + + if group_diff != 0: + # Check if rolling deployment is in progress for THIS GROUP + + group_has_out_of_date = has_out_of_date_replicas(run_model, group_filter=group.name) + + # During rolling deployment, don't scale down old replicas + # Let rolling deployment handle stopping old replicas + if group_diff < 0 and group_has_out_of_date: + # Skip scaling down during rolling deployment + continue + await scale_run_replicas_for_group( + session=session, + run_model=run_model, + group=group, + replicas_diff=group_diff, + run_spec=RunSpec.__response__.parse_raw(run_model.run_spec), + active_replicas=active_replicas, + inactive_replicas=inactive_replicas, + ) + + +async def scale_run_replicas_for_group( + session: AsyncSession, + run_model: RunModel, + group: ReplicaGroup, + replicas_diff: int, + run_spec: RunSpec, + active_replicas: List[Tuple[int, bool, int, List[JobModel]]], + inactive_replicas: List[Tuple[int, bool, int, List[JobModel]]], +) -> None: + """Scale a specific replica group up or down""" + if replicas_diff == 0: + return + + logger.info( + "%s: scaling %s %s replica(s) for group '%s'", + fmt(run_model), + "UP" if replicas_diff > 0 else "DOWN", + 
abs(replicas_diff), + group.name, + ) + + if replicas_diff < 0: + scale_down_replicas(session, active_replicas, abs(replicas_diff)) + else: + await _scale_up_replicas( + session=session, + run_model=run_model, + inactive_replicas=inactive_replicas, + replicas_diff=replicas_diff, + run_spec=run_spec, + group_name=group.name, + ) + + +def job_belongs_to_group(job: JobModel, group_name: str) -> bool: + job_spec = JobSpec.__response__.parse_raw(job.job_spec_data) + return job_spec.replica_group == group_name + + +def has_out_of_date_replicas(run: RunModel, group_filter: Optional[str] = None) -> bool: + for job in run.jobs: + # Filter jobs by group if specified + if group_filter is not None: + if not job_belongs_to_group(job, group_filter): + continue + if job.deployment_num < run.deployment_num and not ( + job.status.is_finished() or job.termination_reason == JobTerminationReason.SCALED_DOWN + ): + return True + return False diff --git a/src/dstack/_internal/server/services/runs/spec.py b/src/dstack/_internal/server/services/runs/spec.py index 73b6d9fc7a..db81eb724a 100644 --- a/src/dstack/_internal/server/services/runs/spec.py +++ b/src/dstack/_internal/server/services/runs/spec.py @@ -88,7 +88,9 @@ def validate_run_spec_and_set_defaults( f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s" ) if isinstance(run_spec.configuration, ServiceConfiguration): - if run_spec.merged_profile.schedule and run_spec.configuration.replicas.min == 0: + if run_spec.merged_profile.schedule and all( + group.count.min == 0 for group in run_spec.configuration.replica_groups + ): raise ServerClientError( "Scheduled services with autoscaling to zero are not supported" ) @@ -149,11 +151,10 @@ def get_nodes_required_num(run_spec: RunSpec) -> int: nodes_required_num = 1 if run_spec.configuration.type == "task": nodes_required_num = run_spec.configuration.nodes - elif ( - run_spec.configuration.type == "service" - and run_spec.configuration.replicas.min is 
not None - ): - nodes_required_num = run_spec.configuration.replicas.min + elif run_spec.configuration.type == "service": + nodes_required_num = sum( + group.count.min or 0 for group in run_spec.configuration.replica_groups + ) return nodes_required_num diff --git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index 39e8e98c6a..1f7dcde79d 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -2,6 +2,7 @@ Application logic related to `type: service` runs. """ +import json import uuid from datetime import datetime from typing import Optional @@ -145,7 +146,11 @@ def _register_service_in_server(run_model: RunModel, run_spec: RunSpec) -> Servi "The `https` configuration property is not applicable when running services without a gateway." " Please configure a gateway or remove the `https` property from the service configuration" ) - if run_spec.configuration.replicas.min != run_spec.configuration.replicas.max: + # Check if any group has autoscaling (min != max) + has_autoscaling = any( + group.count.min != group.count.max for group in run_spec.configuration.replica_groups + ) + if has_autoscaling: raise ServerClientError( "Auto-scaling is not supported when running services without a gateway." 
" Please configure a gateway or set `replicas` to a fixed value in the service configuration" @@ -303,13 +308,24 @@ async def update_service_desired_replica_count( configuration: ServiceConfiguration, last_scaled_at: Optional[datetime], ) -> None: - scaler = get_service_scaler(configuration) stats = None if run_model.gateway_id is not None: conn = await get_or_add_gateway_connection(session, run_model.gateway_id) stats = await conn.get_stats(run_model.project.name, run_model.run_name) - run_model.desired_replica_count = scaler.get_desired_count( - current_desired_count=run_model.desired_replica_count, - stats=stats, - last_scaled_at=last_scaled_at, + replica_groups = configuration.replica_groups + desired_replica_counts = {} + total = 0 + prev_counts = ( + json.loads(run_model.desired_replica_counts) if run_model.desired_replica_counts else {} ) + for group in replica_groups: + scaler = get_service_scaler(group.count, group.scaling) + group_desired = scaler.get_desired_count( + current_desired_count=prev_counts.get(group.name, group.count.min or 0), + stats=stats, + last_scaled_at=last_scaled_at, + ) + desired_replica_counts[group.name] = group_desired + total += group_desired + run_model.desired_replica_counts = json.dumps(desired_replica_counts) + run_model.desired_replica_count = total diff --git a/src/dstack/_internal/server/services/services/autoscalers.py b/src/dstack/_internal/server/services/services/autoscalers.py index cd6d06e588..641d2cee4d 100644 --- a/src/dstack/_internal/server/services/services/autoscalers.py +++ b/src/dstack/_internal/server/services/services/autoscalers.py @@ -6,7 +6,8 @@ from pydantic import BaseModel import dstack._internal.utils.common as common_utils -from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.configurations import ScalingSpec +from dstack._internal.core.models.resources import Range from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats @@ 
-119,21 +120,21 @@ def get_desired_count( return new_desired_count -def get_service_scaler(conf: ServiceConfiguration) -> BaseServiceScaler: - assert conf.replicas.min is not None - assert conf.replicas.max is not None - if conf.scaling is None: +def get_service_scaler(count: Range[int], scaling: Optional[ScalingSpec]) -> BaseServiceScaler: + assert count.min is not None + assert count.max is not None + if scaling is None: return ManualScaler( - min_replicas=conf.replicas.min, - max_replicas=conf.replicas.max, + min_replicas=count.min, + max_replicas=count.max, ) - if conf.scaling.metric == "rps": + if scaling.metric == "rps": return RPSAutoscaler( # replicas count validated by configuration model - min_replicas=conf.replicas.min, - max_replicas=conf.replicas.max, - target=conf.scaling.target, - scale_up_delay=conf.scaling.scale_up_delay, - scale_down_delay=conf.scaling.scale_down_delay, + min_replicas=count.min, + max_replicas=count.max, + target=scaling.target, + scale_up_delay=scaling.scale_up_delay, + scale_down_delay=scaling.scale_down_delay, ) - raise ValueError(f"No scaler found for scaling parameters {conf.scaling}") + raise ValueError(f"No scaler found for scaling parameters {scaling}") diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 627fa8a167..9485ed1abb 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -257,6 +257,7 @@ def get_dev_env_run_plan_dict( "replica_num": 0, "job_num": 0, "jobs_per_replica": 1, + "replica_group": "default", "single_branch": False, "max_duration": None, "stop_duration": 300, @@ -493,6 +494,7 @@ def get_dev_env_run_dict( "replica_num": 0, "job_num": 0, "jobs_per_replica": 1, + "replica_group": "default", "single_branch": False, "max_duration": None, "stop_duration": 300, From be788642ce9bf7ab651f56ba75fb4a137e2dff13 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> 
Date: Fri, 23 Jan 2026 17:03:44 +0000 Subject: [PATCH 076/187] [chore]: Add `list_events` utility for unit tests (#3493) --- src/dstack/_internal/server/testing/common.py | 7 +++++++ .../server/background/tasks/test_process_events.py | 10 +++------- src/tests/_internal/server/services/test_instances.py | 10 ++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/dstack/_internal/server/testing/common.py b/src/dstack/_internal/server/testing/common.py index eb325ad156..640a3932dd 100644 --- a/src/dstack/_internal/server/testing/common.py +++ b/src/dstack/_internal/server/testing/common.py @@ -7,6 +7,7 @@ from uuid import UUID import gpuhunt +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.backends.base.compute import ( @@ -90,6 +91,7 @@ BackendModel, ComputeGroupModel, DecryptedString, + EventModel, FileArchiveModel, FleetModel, GatewayComputeModel, @@ -1111,6 +1113,11 @@ async def create_secret( return secret_model +async def list_events(session: AsyncSession) -> list[EventModel]: + res = await session.execute(select(EventModel).order_by(EventModel.recorded_at, EventModel.id)) + return list(res.scalars().all()) + + def get_private_key_string() -> str: return """ -----BEGIN RSA PRIVATE KEY----- diff --git a/src/tests/_internal/server/background/tasks/test_process_events.py b/src/tests/_internal/server/background/tasks/test_process_events.py index 899f2946e8..21043e0bae 100644 --- a/src/tests/_internal/server/background/tasks/test_process_events.py +++ b/src/tests/_internal/server/background/tasks/test_process_events.py @@ -3,14 +3,12 @@ import pytest from freezegun import freeze_time -from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.server import settings from dstack._internal.server.background.tasks.process_events import delete_events -from dstack._internal.server.models import EventModel from dstack._internal.server.services import 
events -from dstack._internal.server.testing.common import create_user +from dstack._internal.server.testing.common import create_user, list_events @pytest.mark.asyncio @@ -27,8 +25,7 @@ async def test_deletes_old_events(test_db, session: AsyncSession) -> None: ) await session.commit() - res = await session.execute(select(EventModel)) - all_events = res.scalars().all() + all_events = await list_events(session) assert len(all_events) == 10 with ( @@ -37,8 +34,7 @@ async def test_deletes_old_events(test_db, session: AsyncSession) -> None: ): await delete_events() - res = await session.execute(select(EventModel).order_by(EventModel.recorded_at)) - remaining_events = res.scalars().all() + remaining_events = await list_events(session) assert len(remaining_events) == 5 assert [e.message for e in remaining_events] == [ "Event 5", diff --git a/src/tests/_internal/server/services/test_instances.py b/src/tests/_internal/server/services/test_instances.py index 9e4cb02e3a..ca6432d61e 100644 --- a/src/tests/_internal/server/services/test_instances.py +++ b/src/tests/_internal/server/services/test_instances.py @@ -1,7 +1,6 @@ import uuid import pytest -from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession import dstack._internal.server.services.instances as instances_services @@ -15,13 +14,14 @@ Resources, ) from dstack._internal.core.models.profiles import Profile -from dstack._internal.server.models import EventModel, InstanceModel +from dstack._internal.server.models import InstanceModel from dstack._internal.server.testing.common import ( create_instance, create_project, create_user, get_volume, get_volume_configuration, + list_events, ) from dstack._internal.utils.common import get_current_datetime @@ -41,8 +41,7 @@ async def test_includes_termination_reason_in_event_messages_only_once( instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATING) instances_services.switch_instance_status(session, instance, 
InstanceStatus.TERMINATED) - res = await session.execute(select(EventModel)) - events = res.scalars().all() + events = await list_events(session) assert len(events) == 2 assert {e.message for e in events} == { "Instance status changed PENDING -> TERMINATING. Termination reason: ERROR (Some err)", @@ -63,8 +62,7 @@ async def test_includes_termination_reason_in_event_message_when_switching_direc instance.termination_reason_message = "Some err" instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATED) - res = await session.execute(select(EventModel)) - events = res.scalars().all() + events = await list_events(session) assert len(events) == 1 assert events[0].message == ( "Instance status changed PENDING -> TERMINATED. Termination reason: ERROR (Some err)" From 5205c8916b10f345946ca8f09f1dba8d11446b65 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Sun, 25 Jan 2026 15:15:21 +0000 Subject: [PATCH 077/187] [Docs]: Fix k8s backend config example (#3495) --- docs/docs/concepts/backends.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 574342318f..9a1c90ec5c 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -1015,13 +1015,13 @@ To use the `kubernetes` backend with `dstack`, you need to configure it with the ```yaml projects: - name: main - backends: - - type: kubernetes - kubeconfig: - filename: ~/.kube/config - proxy_jump: - hostname: 204.12.171.137 - port: 32000 + backends: + - type: kubernetes + kubeconfig: + filename: ~/.kube/config + proxy_jump: + hostname: 204.12.171.137 + port: 32000 ``` From 883b45550d10dffb6a98c3df51d203537bef4436 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 26 Jan 2026 12:15:02 +0500 Subject: [PATCH 078/187] Move ruff.toml to pyproject.toml (#3496) --- pyproject.toml | 14 ++++++++++++++ ruff.toml | 13 ------------- 2 files 
changed, 14 insertions(+), 13 deletions(-) delete mode 100644 ruff.toml diff --git a/pyproject.toml b/pyproject.toml index 37588341e5..748b5211a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,6 +81,20 @@ ignore-case = true [tool.uv.sources] dstack-plugin-server = { path = "examples/plugins/example_plugin_server", editable = true } +[tool.ruff] +target-version = "py39" +line-length = 99 + +[tool.ruff.lint] +select = ["E", "F", "I", "Q", "W", "PGH", "FLY", "S113"] +ignore = [ + "E501", + "E712", +] + +[tool.ruff.lint.isort] +known-first-party = ["dstack"] + [tool.pyright] typeCheckingMode = "standard" include = [ diff --git a/ruff.toml b/ruff.toml deleted file mode 100644 index b61fb7d00b..0000000000 --- a/ruff.toml +++ /dev/null @@ -1,13 +0,0 @@ -target-version = "py39" -line-length = 99 - -[lint] -select = ['E', 'F', 'I' ,'Q', 'W', 'PGH', 'FLY', 'S113'] -ignore = [ - 'E501', - 'E712', -] - -[lint.isort] -known-first-party = ["dstack"] -known-third-party = ["mkdocs_gen_files", "datacrunch"] From ca2172cd5f556c6c1941ed3fd4a47002cf79713c Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Mon, 26 Jan 2026 08:07:51 +0000 Subject: [PATCH 079/187] Events: instance/job reachability and health (#3482) Add new events: - Job became reachable/unreachable - Instance became reachable/unreachable - Instance health changed Additionally, do not set `unreachable=True` for instances in statuses other than `idle` and `busy` to avoid unnecessary and potentially misleading events during instance provisioning. 
--- .../background/tasks/process_instances.py | 31 +++++++++++++++++-- .../background/tasks/process_running_jobs.py | 29 ++++++++++++++--- .../tasks/test_process_instances.py | 10 ++++++ .../tasks/test_process_running_jobs.py | 4 +++ 4 files changed, 68 insertions(+), 6 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index c2bc27ee85..3cb53322a7 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -43,6 +43,7 @@ from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.fleets import InstanceGroupPlacement from dstack._internal.core.models.instances import ( + HealthStatus, InstanceAvailability, InstanceOfferWithAvailability, InstanceRuntime, @@ -75,6 +76,7 @@ InstanceHealthResponse, ) from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import events from dstack._internal.server.services.fleets import ( fleet_model_to_fleet, get_create_instance_offers, @@ -759,8 +761,8 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non ) session.add(health_check_model) - instance.health = health_status - instance.unreachable = not instance_check.reachable + _set_health(session, instance, health_status) + _set_unreachable(session, instance, unreachable=not instance_check.reachable) if instance_check.reachable: instance.termination_deadline = None @@ -1093,6 +1095,31 @@ async def _terminate(session: AsyncSession, instance: InstanceModel) -> None: switch_instance_status(session, instance, InstanceStatus.TERMINATED) +def _set_health(session: AsyncSession, instance: InstanceModel, health: HealthStatus) -> None: + if instance.health != health: + events.emit( + session, + f"Instance health changed {instance.health.upper()} -> {health.upper()}", 
+ actor=events.SystemActor(), + targets=[events.Target.from_model(instance)], + ) + instance.health = health + + +def _set_unreachable(session: AsyncSession, instance: InstanceModel, unreachable: bool) -> None: + if ( + instance.status.is_available() # avoid misleading event during provisioning + and instance.unreachable != unreachable + ): + events.emit( + session, + "Instance became unreachable" if unreachable else "Instance became reachable", + actor=events.SystemActor(), + targets=[events.Target.from_model(instance)], + ) + instance.unreachable = unreachable + + def _next_termination_retry_at(instance: InstanceModel) -> datetime.datetime: assert instance.last_termination_retry_at is not None return instance.last_termination_retry_at + TERMINATION_RETRY_TIMEOUT diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index f5ca6c61ae..9de0fffcc0 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -51,9 +51,9 @@ UserModel, ) from dstack._internal.server.schemas.runner import GPUDevice, TaskStatus +from dstack._internal.server.services import events, services from dstack._internal.server.services import files as files_services from dstack._internal.server.services import logs as logs_services -from dstack._internal.server.services import services from dstack._internal.server.services.instances import get_instance_ssh_private_keys from dstack._internal.server.services.jobs import ( find_job, @@ -355,7 +355,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel): ) if success: - job_model.disconnected_at = None + _reset_disconnected_at(session, job_model) else: if job_model.termination_reason: logger.warning( @@ -368,8 +368,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel): # job will be terminated and instance 
will be emptied by process_terminating_jobs else: # No job_model.termination_reason set means ssh connection failed - if job_model.disconnected_at is None: - job_model.disconnected_at = common_utils.get_current_datetime() + _set_disconnected_at_now(session, job_model) if _should_terminate_job_due_to_disconnect(job_model): # TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE for on-demand. job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY @@ -933,6 +932,28 @@ def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Ite return False +def _set_disconnected_at_now(session: AsyncSession, job_model: JobModel) -> None: + if job_model.disconnected_at is None: + job_model.disconnected_at = common_utils.get_current_datetime() + events.emit( + session, + "Job became unreachable", + actor=events.SystemActor(), + targets=[events.Target.from_model(job_model)], + ) + + +def _reset_disconnected_at(session: AsyncSession, job_model: JobModel) -> None: + if job_model.disconnected_at is not None: + job_model.disconnected_at = None + events.emit( + session, + "Job became reachable", + actor=events.SystemActor(), + targets=[events.Target.from_model(job_model)], + ) + + def _get_cluster_info( jobs: List[Job], replica_num: int, diff --git a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/tasks/test_process_instances.py index 38bffc4421..8691f3e7e6 100644 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ b/src/tests/_internal/server/background/tasks/test_process_instances.py @@ -77,6 +77,7 @@ get_job_provisioning_data, get_placement_group_provisioning_data, get_remote_connection_info, + list_events, ) from dstack._internal.utils.common import get_current_datetime @@ -324,10 +325,13 @@ async def test_check_shim_process_ureachable_state( healthcheck.assert_called() await session.refresh(instance) + events = await list_events(session) assert 
instance is not None assert instance.status == InstanceStatus.IDLE assert not instance.unreachable + assert len(events) == 1 + assert events[0].message == "Instance became reachable" @pytest.mark.asyncio @pytest.mark.parametrize("health_status", [HealthStatus.HEALTHY, HealthStatus.FAILURE]) @@ -351,12 +355,15 @@ async def test_check_shim_switch_to_unreachable_state( await process_instances() await session.refresh(instance) + events = await list_events(session) assert instance is not None assert instance.status == InstanceStatus.IDLE assert instance.unreachable # Should keep the previous status assert instance.health == health_status + assert len(events) == 1 + assert events[0].message == "Instance became unreachable" @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -384,11 +391,14 @@ async def test_check_shim_check_instance_health(self, test_db, session: AsyncSes await process_instances() await session.refresh(instance) + events = await list_events(session) assert instance is not None assert instance.status == InstanceStatus.IDLE assert not instance.unreachable assert instance.health == HealthStatus.WARNING + assert len(events) == 1 + assert events[0].message == "Instance health changed HEALTHY -> WARNING" res = await session.execute(select(InstanceHealthCheckModel)) health_check = res.scalars().one() diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index e3cc011be9..601fbe1ee7 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -68,6 +68,7 @@ get_job_runtime_data, get_run_spec, get_volume_configuration, + list_events, ) from dstack._internal.utils.common import get_current_datetime @@ -515,9 +516,12 @@ async def test_pulling_shim_failed(self, test_db, session: AsyncSession): await 
process_running_jobs() assert SSHTunnelMock.call_count == 3 await session.refresh(job) + events = await list_events(session) assert job is not None assert job.disconnected_at is not None assert job.status == JobStatus.PULLING + assert len(events) == 1 + assert events[0].message == "Job became unreachable" with ( patch("dstack._internal.server.services.runner.ssh.SSHTunnel") as SSHTunnelMock, patch("dstack._internal.server.services.runner.ssh.time.sleep"), From 90b0579212d5a9bf90946c2c5a8c3fe448585ca6 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Mon, 26 Jan 2026 08:08:07 +0000 Subject: [PATCH 080/187] Volume events (#3494) - Support volume events in the API, CLI, and UI - Add the following events: - Volume created - Volume status changed - Volume deleted - Volume deleted due to exceeding `auto_cleanup_duration` --- .../List/hooks/useColumnDefinitions.tsx | 13 +++++ .../src/pages/Events/List/hooks/useFilters.ts | 9 ++++ frontend/src/types/event.d.ts | 5 +- src/dstack/_internal/cli/commands/event.py | 12 +++++ src/dstack/_internal/cli/services/events.py | 1 + src/dstack/_internal/core/models/events.py | 1 + .../background/tasks/process_idle_volumes.py | 9 +++- .../background/tasks/process_volumes.py | 9 ++-- src/dstack/_internal/server/models.py | 1 + src/dstack/_internal/server/routers/events.py | 1 + .../_internal/server/routers/volumes.py | 6 ++- src/dstack/_internal/server/schemas/events.py | 11 +++++ .../_internal/server/services/events.py | 17 +++++++ .../_internal/server/services/volumes.py | 48 ++++++++++++++----- src/dstack/api/server/_events.py | 4 ++ .../tasks/test_process_idle_volumes.py | 4 ++ .../tasks/test_process_submitted_volumes.py | 4 ++ .../_internal/server/routers/test_volumes.py | 9 ++++ 18 files changed, 142 insertions(+), 22 deletions(-) diff --git a/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx b/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx index 88e067814e..ad337cf5f1 
100644 --- a/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx +++ b/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx @@ -112,6 +112,19 @@ export const useColumnsDefinitions = () => { ); + case 'volume': + return ( +
+ Volume{' '} + {target.project_name && ( + + {target.project_name} + + )} + /{target.name} +
+ ); + default: return '---'; } diff --git a/frontend/src/pages/Events/List/hooks/useFilters.ts b/frontend/src/pages/Events/List/hooks/useFilters.ts index 56aa1f67df..a3d510718f 100644 --- a/frontend/src/pages/Events/List/hooks/useFilters.ts +++ b/frontend/src/pages/Events/List/hooks/useFilters.ts @@ -17,6 +17,7 @@ type RequestParamsKeys = keyof Pick< | 'target_instances' | 'target_runs' | 'target_jobs' + | 'target_volumes' | 'within_projects' | 'within_fleets' | 'within_runs' @@ -31,6 +32,7 @@ const filterKeys: Record = { TARGET_INSTANCES: 'target_instances', TARGET_RUNS: 'target_runs', TARGET_JOBS: 'target_jobs', + TARGET_VOLUMES: 'target_volumes', WITHIN_PROJECTS: 'within_projects', WITHIN_FLEETS: 'within_fleets', WITHIN_RUNS: 'within_runs', @@ -47,6 +49,7 @@ const multipleChoiseKeys: RequestParamsKeys[] = [ 'target_instances', 'target_runs', 'target_jobs', + 'target_volumes', 'within_projects', 'within_fleets', 'within_runs', @@ -61,6 +64,7 @@ const targetTypes = [ { label: 'Instance', value: 'instance' }, { label: 'Run', value: 'run' }, { label: 'Job', value: 'job' }, + { label: 'Volume', value: 'volume' }, ]; export const useFilters = () => { @@ -153,6 +157,11 @@ export const useFilters = () => { operators: ['='], propertyLabel: 'Target jobs', }, + { + key: filterKeys.TARGET_VOLUMES, + operators: ['='], + propertyLabel: 'Target volumes', + }, { key: filterKeys.WITHIN_PROJECTS, diff --git a/frontend/src/types/event.d.ts b/frontend/src/types/event.d.ts index 4ef6bfcb89..3aadfa1f31 100644 --- a/frontend/src/types/event.d.ts +++ b/frontend/src/types/event.d.ts @@ -1,4 +1,4 @@ -declare type TEventTargetType = 'project' | 'user' | 'fleet' | 'instance' | 'run' | 'job'; +declare type TEventTargetType = 'project' | 'user' | 'fleet' | 'instance' | 'run' | 'job' | 'volume'; declare type TEventListRequestParams = Omit & { prev_recorded_at?: string; @@ -8,6 +8,7 @@ declare type TEventListRequestParams = Omit EventListFilters: filters.target_runs = [ 
api.client.runs.get(api.project, name).id for name in args.target_runs ] + elif args.target_volumes: + filters.target_volumes = [ + api.client.volumes.get(project_name=api.project, name=name).id + for name in args.target_volumes + ] if args.within_fleets: filters.within_fleets = [ diff --git a/src/dstack/_internal/cli/services/events.py b/src/dstack/_internal/cli/services/events.py index 4f3f620b76..c2903065c9 100644 --- a/src/dstack/_internal/cli/services/events.py +++ b/src/dstack/_internal/cli/services/events.py @@ -16,6 +16,7 @@ class EventListFilters: target_fleets: Optional[list[uuid.UUID]] = None target_runs: Optional[list[uuid.UUID]] = None + target_volumes: Optional[list[uuid.UUID]] = None within_projects: Optional[list[uuid.UUID]] = None within_fleets: Optional[list[uuid.UUID]] = None within_runs: Optional[list[uuid.UUID]] = None diff --git a/src/dstack/_internal/core/models/events.py b/src/dstack/_internal/core/models/events.py index fc7f51601a..6dae2dc178 100644 --- a/src/dstack/_internal/core/models/events.py +++ b/src/dstack/_internal/core/models/events.py @@ -16,6 +16,7 @@ class EventTargetType(str, Enum): INSTANCE = "instance" RUN = "run" JOB = "job" + VOLUME = "volume" class EventTarget(CoreModel): diff --git a/src/dstack/_internal/server/background/tasks/process_idle_volumes.py b/src/dstack/_internal/server/background/tasks/process_idle_volumes.py index 2557012c2b..cd5b66bc70 100644 --- a/src/dstack/_internal/server/background/tasks/process_idle_volumes.py +++ b/src/dstack/_internal/server/background/tasks/process_idle_volumes.py @@ -12,6 +12,7 @@ from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ProjectModel, UserModel, VolumeModel from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import events from dstack._internal.server.services.locking import get_locker from dstack._internal.server.services.volumes import ( 
get_volume_configuration, @@ -100,8 +101,12 @@ async def _delete_idle_volumes(session: AsyncSession, volumes: List[VolumeModel] volume_model.deleted = True volume_model.deleted_at = get_current_datetime() - - logger.info("Deleted idle volume %s", volume_model.name) + events.emit( + session=session, + message="Volume deleted due to exceeding auto_cleanup_duration", + actor=events.SystemActor(), + targets=[events.Target.from_model(volume_model)], + ) await session.commit() diff --git a/src/dstack/_internal/server/background/tasks/process_volumes.py b/src/dstack/_internal/server/background/tasks/process_volumes.py index 534af8d48f..66124619a4 100644 --- a/src/dstack/_internal/server/background/tasks/process_volumes.py +++ b/src/dstack/_internal/server/background/tasks/process_volumes.py @@ -16,6 +16,7 @@ from dstack._internal.server.services import backends as backends_services from dstack._internal.server.services import volumes as volumes_services from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.volumes import switch_volume_status from dstack._internal.server.utils import sentry_utils from dstack._internal.utils.common import get_current_datetime, run_async from dstack._internal.utils.logging import get_logger @@ -79,8 +80,8 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM volume.name, volume.configuration.backend.value, ) - volume_model.status = VolumeStatus.FAILED volume_model.status_message = "Backend not available" + switch_volume_status(session, volume_model, VolumeStatus.FAILED) volume_model.last_processed_at = get_current_datetime() await session.commit() return @@ -102,18 +103,18 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM ) except BackendError as e: logger.info("Failed to create volume %s: %s", volume_model.name, repr(e)) - volume_model.status = VolumeStatus.FAILED status_message = f"Backend error: {repr(e)}" if len(e.args) > 0: 
status_message = str(e.args[0]) volume_model.status_message = status_message + switch_volume_status(session, volume_model, VolumeStatus.FAILED) volume_model.last_processed_at = get_current_datetime() await session.commit() return except Exception as e: logger.exception("Got exception when creating volume %s", volume_model.name) - volume_model.status = VolumeStatus.FAILED volume_model.status_message = f"Unexpected error: {repr(e)}" + switch_volume_status(session, volume_model, VolumeStatus.FAILED) volume_model.last_processed_at = get_current_datetime() await session.commit() return @@ -123,6 +124,6 @@ async def _process_submitted_volume(session: AsyncSession, volume_model: VolumeM # Provisioned volumes marked as active since they become available almost immediately in AWS # TODO: Consider checking volume state volume_model.volume_provisioning_data = vpd.json() - volume_model.status = VolumeStatus.ACTIVE + switch_volume_status(session, volume_model, VolumeStatus.ACTIVE) volume_model.last_processed_at = get_current_datetime() await session.commit() diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 982ade215f..7e9db282d1 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -745,6 +745,7 @@ class VolumeModel(BaseModel): deleted: Mapped[bool] = mapped_column(Boolean, default=False) deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + # NOTE: `status` must be changed only via `switch_volume_status()` status: Mapped[VolumeStatus] = mapped_column(EnumAsString(VolumeStatus, 100), index=True) status_message: Mapped[Optional[str]] = mapped_column(Text) diff --git a/src/dstack/_internal/server/routers/events.py b/src/dstack/_internal/server/routers/events.py index 3895767d6f..be75cccbb4 100644 --- a/src/dstack/_internal/server/routers/events.py +++ b/src/dstack/_internal/server/routers/events.py @@ -44,6 +44,7 @@ async def list_events( 
target_instances=body.target_instances, target_runs=body.target_runs, target_jobs=body.target_jobs, + target_volumes=body.target_volumes, within_projects=body.within_projects, within_fleets=body.within_fleets, within_runs=body.within_runs, diff --git a/src/dstack/_internal/server/routers/volumes.py b/src/dstack/_internal/server/routers/volumes.py index 2ac5034707..ead5465c48 100644 --- a/src/dstack/_internal/server/routers/volumes.py +++ b/src/dstack/_internal/server/routers/volumes.py @@ -116,5 +116,7 @@ async def delete_volumes( """ Deletes one or more volumes. """ - _, project = user_project - await volumes_services.delete_volumes(session=session, project=project, names=body.names) + user, project = user_project + await volumes_services.delete_volumes( + session=session, project=project, names=body.names, user=user + ) diff --git a/src/dstack/_internal/server/schemas/events.py b/src/dstack/_internal/server/schemas/events.py index 537a731970..66ea2e3404 100644 --- a/src/dstack/_internal/server/schemas/events.py +++ b/src/dstack/_internal/server/schemas/events.py @@ -80,6 +80,17 @@ class ListEventsRequest(CoreModel): max_items=MAX_FILTER_ITEMS, ), ] = None + target_volumes: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of volume IDs." 
+ " The response will only include events that target the specified volumes" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None within_projects: Annotated[ Optional[list[uuid.UUID]], Field( diff --git a/src/dstack/_internal/server/services/events.py b/src/dstack/_internal/server/services/events.py index c9818ef9ee..80d81734b5 100644 --- a/src/dstack/_internal/server/services/events.py +++ b/src/dstack/_internal/server/services/events.py @@ -20,6 +20,7 @@ ProjectModel, RunModel, UserModel, + VolumeModel, ) from dstack._internal.server.services.logging import fmt_entity from dstack._internal.utils.common import get_current_datetime @@ -91,6 +92,7 @@ def from_model( ProjectModel, RunModel, UserModel, + VolumeModel, ], ) -> "Target": if isinstance(model, FleetModel): @@ -135,6 +137,13 @@ def from_model( id=model.id, name=model.name, ) + if isinstance(model, VolumeModel): + return Target( + type=EventTargetType.VOLUME, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.name, + ) raise ValueError(f"Unsupported model type: {type(model)}") def fmt(self) -> str: @@ -212,6 +221,7 @@ async def list_events( target_instances: Optional[list[uuid.UUID]], target_runs: Optional[list[uuid.UUID]], target_jobs: Optional[list[uuid.UUID]], + target_volumes: Optional[list[uuid.UUID]], within_projects: Optional[list[uuid.UUID]], within_fleets: Optional[list[uuid.UUID]], within_runs: Optional[list[uuid.UUID]], @@ -281,6 +291,13 @@ async def list_events( EventTargetModel.entity_id.in_(target_jobs), ) ) + if target_volumes is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.VOLUME, + EventTargetModel.entity_id.in_(target_volumes), + ) + ) if within_projects is not None: target_filters.append(EventTargetModel.entity_project_id.in_(within_projects)) if within_fleets is not None: diff --git a/src/dstack/_internal/server/services/volumes.py b/src/dstack/_internal/server/services/volumes.py index 
eb8f4bab64..49a3d79594 100644 --- a/src/dstack/_internal/server/services/volumes.py +++ b/src/dstack/_internal/server/services/volumes.py @@ -2,7 +2,7 @@ from datetime import datetime, timedelta from typing import List, Optional -from sqlalchemy import and_, func, or_, select, update +from sqlalchemy import and_, func, or_, select from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload, selectinload @@ -33,6 +33,7 @@ VolumeModel, ) from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import events from dstack._internal.server.services.instances import get_instance_provisioning_data from dstack._internal.server.services.locking import ( get_locker, @@ -46,6 +47,24 @@ logger = get_logger(__name__) +def switch_volume_status( + session: AsyncSession, + volume_model: VolumeModel, + new_status: VolumeStatus, + actor: events.AnyActor = events.SystemActor(), +): + old_status = volume_model.status + if old_status == new_status: + return + + volume_model.status = new_status + + msg = f"Volume status changed {old_status.upper()} -> {new_status.upper()}" + if volume_model.status_message is not None: + msg += f" ({volume_model.status_message})" + events.emit(session, msg, actor=actor, targets=[events.Target.from_model(volume_model)]) + + async def list_volumes( session: AsyncSession, user: UserModel, @@ -245,11 +264,19 @@ async def create_volume( attachments=[], ) session.add(volume_model) + events.emit( + session, + message=f"Volume created. 
Status: {volume_model.status.upper()}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(volume_model)], + ) await session.commit() return volume_model_to_volume(volume_model) -async def delete_volumes(session: AsyncSession, project: ProjectModel, names: List[str]): +async def delete_volumes( + session: AsyncSession, project: ProjectModel, names: List[str], user: UserModel +): res = await session.execute( select(VolumeModel).where( VolumeModel.project_id == project.id, @@ -287,17 +314,14 @@ async def delete_volumes(session: AsyncSession, project: ProjectModel, names: Li await _delete_volume(session=session, project=project, volume_model=volume_model) except Exception: logger.exception("Error when deleting volume %s", volume_model.name) - await session.execute( - update(VolumeModel) - .where( - VolumeModel.project_id == project.id, - VolumeModel.id.in_(volumes_ids), - ) - .values( - deleted=True, - deleted_at=common.get_current_datetime(), + volume_model.deleted = True + volume_model.deleted_at = common.get_current_datetime() + events.emit( + session, + message="Volume deleted", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(volume_model)], ) - ) await session.commit() diff --git a/src/dstack/api/server/_events.py b/src/dstack/api/server/_events.py index 2f5d6639a3..d9bf828394 100644 --- a/src/dstack/api/server/_events.py +++ b/src/dstack/api/server/_events.py @@ -27,6 +27,9 @@ def list( prev_id: Optional[UUID] = None, limit: int = LIST_EVENTS_DEFAULT_LIMIT, ascending: bool = False, + *, + # NOTE: New parameters go here. Avoid positional parameters, they can break compatibility. 
+ target_volumes: Optional[list[UUID]] = None, ) -> list[Event]: if prev_recorded_at is not None: # Time zones other than UTC are misinterpreted by the server: @@ -39,6 +42,7 @@ def list( target_instances=target_instances, target_runs=target_runs, target_jobs=target_jobs, + target_volumes=target_volumes, within_projects=within_projects, within_fleets=within_fleets, within_runs=within_runs, diff --git a/src/tests/_internal/server/background/tasks/test_process_idle_volumes.py b/src/tests/_internal/server/background/tasks/test_process_idle_volumes.py index 13f557df2f..9d73afbb78 100644 --- a/src/tests/_internal/server/background/tasks/test_process_idle_volumes.py +++ b/src/tests/_internal/server/background/tasks/test_process_idle_volumes.py @@ -20,6 +20,7 @@ create_volume, get_volume_configuration, get_volume_provisioning_data, + list_events, ) from dstack._internal.utils.common import get_current_datetime @@ -73,10 +74,13 @@ async def test_deletes_idle_volumes(self, test_db, session: AsyncSession): await session.refresh(volume1) await session.refresh(volume2) + events = await list_events(session) assert volume1.deleted assert volume1.deleted_at is not None assert not volume2.deleted assert volume2.deleted_at is None + assert len(events) == 1 + assert events[0].message == "Volume deleted due to exceeding auto_cleanup_duration" @pytest.mark.asyncio diff --git a/src/tests/_internal/server/background/tasks/test_process_submitted_volumes.py b/src/tests/_internal/server/background/tasks/test_process_submitted_volumes.py index 0b2f0b1948..dfeef1e42e 100644 --- a/src/tests/_internal/server/background/tasks/test_process_submitted_volumes.py +++ b/src/tests/_internal/server/background/tasks/test_process_submitted_volumes.py @@ -11,6 +11,7 @@ create_project, create_user, create_volume, + list_events, ) @@ -51,3 +52,6 @@ async def test_provisiones_volumes(self, test_db, session: AsyncSession): aws_mock.compute.return_value.create_volume.assert_called_once() await 
session.refresh(volume) assert volume.status == VolumeStatus.ACTIVE + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume status changed SUBMITTED -> ACTIVE" diff --git a/src/tests/_internal/server/routers/test_volumes.py b/src/tests/_internal/server/routers/test_volumes.py index 1949747f62..f7719f9374 100644 --- a/src/tests/_internal/server/routers/test_volumes.py +++ b/src/tests/_internal/server/routers/test_volumes.py @@ -22,6 +22,7 @@ get_auth_headers, get_volume_configuration, get_volume_provisioning_data, + list_events, ) @@ -357,6 +358,9 @@ async def test_creates_volume(self, test_db, session: AsyncSession, client: Asyn } res = await session.execute(select(VolumeModel)) assert res.scalar_one() + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume created. Status: SUBMITTED" class TestDeleteVolumes: @@ -397,6 +401,9 @@ async def test_deletes_volumes(self, test_db, session: AsyncSession, client: Asy assert response.status_code == 200 await session.refresh(volume) assert volume.deleted + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -428,3 +435,5 @@ async def test_returns_400_when_volumes_in_use( assert response.status_code == 400 await session.refresh(volume) assert not volume.deleted + events = await list_events(session) + assert len(events) == 0 From 198534107ed07a50e05074395178a656305a76a2 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 26 Jan 2026 14:30:36 +0500 Subject: [PATCH 081/187] Set JobTerminationReason.INSTANCE_UNREACHABLE for unreachable on-demand instances (#3497) --- .../server/background/tasks/process_running_jobs.py | 8 ++++++-- .../server/background/tasks/test_process_running_jobs.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git 
a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index 9de0fffcc0..bcb35a0898 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -370,8 +370,12 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel): # No job_model.termination_reason set means ssh connection failed _set_disconnected_at_now(session, job_model) if _should_terminate_job_due_to_disconnect(job_model): - # TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE for on-demand. - job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY + if job_provisioning_data.instance_type.resources.spot: + job_model.termination_reason = ( + JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY + ) + else: + job_model.termination_reason = JobTerminationReason.INSTANCE_UNREACHABLE job_model.termination_reason_message = "Instance is unreachable" switch_job_status(session, job_model, JobStatus.TERMINATING) else: diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py index 601fbe1ee7..9e318866c5 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/tasks/test_process_running_jobs.py @@ -532,7 +532,7 @@ async def test_pulling_shim_failed(self, test_db, session: AsyncSession): assert SSHTunnelMock.call_count == 3 await session.refresh(job) assert job.status == JobStatus.TERMINATING - assert job.termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY + assert job.termination_reason == JobTerminationReason.INSTANCE_UNREACHABLE assert job.remove_at is None @pytest.mark.asyncio From b4b69fb4d74d946ddc342fe0e9469466cd2c8844 Mon Sep 17 00:00:00 2001 From: jvstme 
<36324149+jvstme@users.noreply.github.com> Date: Tue, 27 Jan 2026 08:22:24 +0000 Subject: [PATCH 082/187] Support gateway events in API, CLI, and UI (#3499) --- .../List/hooks/useColumnDefinitions.tsx | 13 +++++++++++++ .../src/pages/Events/List/hooks/useFilters.ts | 9 +++++++++ frontend/src/types/event.d.ts | 5 +++-- src/dstack/_internal/cli/commands/event.py | 19 +++++++++++++++++++ src/dstack/_internal/cli/services/events.py | 1 + src/dstack/_internal/core/models/events.py | 1 + src/dstack/_internal/core/models/gateways.py | 4 ++++ src/dstack/_internal/server/routers/events.py | 1 + src/dstack/_internal/server/schemas/events.py | 11 +++++++++++ .../_internal/server/services/events.py | 17 +++++++++++++++++ .../server/services/gateways/__init__.py | 1 + src/dstack/api/server/_events.py | 2 ++ .../_internal/server/routers/test_gateways.py | 8 ++++++++ 13 files changed, 90 insertions(+), 2 deletions(-) diff --git a/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx b/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx index ad337cf5f1..d6e5b846ea 100644 --- a/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx +++ b/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx @@ -125,6 +125,19 @@ export const useColumnsDefinitions = () => { ); + case 'gateway': + return ( +
+ Gateway{' '} + {target.project_name && ( + + {target.project_name} + + )} + /{target.name} +
+ ); + default: return '---'; } diff --git a/frontend/src/pages/Events/List/hooks/useFilters.ts b/frontend/src/pages/Events/List/hooks/useFilters.ts index a3d510718f..d463770b30 100644 --- a/frontend/src/pages/Events/List/hooks/useFilters.ts +++ b/frontend/src/pages/Events/List/hooks/useFilters.ts @@ -18,6 +18,7 @@ type RequestParamsKeys = keyof Pick< | 'target_runs' | 'target_jobs' | 'target_volumes' + | 'target_gateways' | 'within_projects' | 'within_fleets' | 'within_runs' @@ -33,6 +34,7 @@ const filterKeys: Record = { TARGET_RUNS: 'target_runs', TARGET_JOBS: 'target_jobs', TARGET_VOLUMES: 'target_volumes', + TARGET_GATEWAYS: 'target_gateways', WITHIN_PROJECTS: 'within_projects', WITHIN_FLEETS: 'within_fleets', WITHIN_RUNS: 'within_runs', @@ -50,6 +52,7 @@ const multipleChoiseKeys: RequestParamsKeys[] = [ 'target_runs', 'target_jobs', 'target_volumes', + 'target_gateways', 'within_projects', 'within_fleets', 'within_runs', @@ -65,6 +68,7 @@ const targetTypes = [ { label: 'Run', value: 'run' }, { label: 'Job', value: 'job' }, { label: 'Volume', value: 'volume' }, + { label: 'Gateway', value: 'gateway' }, ]; export const useFilters = () => { @@ -162,6 +166,11 @@ export const useFilters = () => { operators: ['='], propertyLabel: 'Target volumes', }, + { + key: filterKeys.TARGET_GATEWAYS, + operators: ['='], + propertyLabel: 'Target gateways', + }, { key: filterKeys.WITHIN_PROJECTS, diff --git a/frontend/src/types/event.d.ts b/frontend/src/types/event.d.ts index 3aadfa1f31..618ea6673f 100644 --- a/frontend/src/types/event.d.ts +++ b/frontend/src/types/event.d.ts @@ -1,4 +1,4 @@ -declare type TEventTargetType = 'project' | 'user' | 'fleet' | 'instance' | 'run' | 'job' | 'volume'; +declare type TEventTargetType = 'project' | 'user' | 'fleet' | 'instance' | 'run' | 'job' | 'volume' | 'gateway'; declare type TEventListRequestParams = Omit & { prev_recorded_at?: string; @@ -9,6 +9,7 @@ declare type TEventListRequestParams = Omit EventListFilters: 
api.client.volumes.get(project_name=api.project, name=name).id for name in args.target_volumes ] + elif args.target_gateways: + filters.target_gateways = [] + for name in args.target_gateways: + id = api.client.gateways.get(api.project, name).id + if id is None: + # TODO(0.21): Remove this check once `Gateway.id` is required. + raise CLIError( + "Cannot determine gateway ID, most likely due to an outdated dstack server." + " Update the server to 0.20.7 or higher or remove --target-gateway." + ) + filters.target_gateways.append(id) if args.within_fleets: filters.within_fleets = [ diff --git a/src/dstack/_internal/cli/services/events.py b/src/dstack/_internal/cli/services/events.py index c2903065c9..11f764bd15 100644 --- a/src/dstack/_internal/cli/services/events.py +++ b/src/dstack/_internal/cli/services/events.py @@ -17,6 +17,7 @@ class EventListFilters: target_fleets: Optional[list[uuid.UUID]] = None target_runs: Optional[list[uuid.UUID]] = None target_volumes: Optional[list[uuid.UUID]] = None + target_gateways: Optional[list[uuid.UUID]] = None within_projects: Optional[list[uuid.UUID]] = None within_fleets: Optional[list[uuid.UUID]] = None within_runs: Optional[list[uuid.UUID]] = None diff --git a/src/dstack/_internal/core/models/events.py b/src/dstack/_internal/core/models/events.py index 6dae2dc178..289c4fc674 100644 --- a/src/dstack/_internal/core/models/events.py +++ b/src/dstack/_internal/core/models/events.py @@ -17,6 +17,7 @@ class EventTargetType(str, Enum): RUN = "run" JOB = "job" VOLUME = "volume" + GATEWAY = "gateway" class EventTarget(CoreModel): diff --git a/src/dstack/_internal/core/models/gateways.py b/src/dstack/_internal/core/models/gateways.py index 2dfeb5b181..b342c0a73b 100644 --- a/src/dstack/_internal/core/models/gateways.py +++ b/src/dstack/_internal/core/models/gateways.py @@ -1,4 +1,5 @@ import datetime +import uuid from enum import Enum from typing import Dict, Optional, Union @@ -93,6 +94,9 @@ class GatewaySpec(CoreModel): class 
Gateway(CoreModel): + # ID is only optional on the client side for compatibility with pre-0.20.7 servers. + # TODO(0.21): Make required. + id: Optional[uuid.UUID] = None name: str configuration: GatewayConfiguration created_at: datetime.datetime diff --git a/src/dstack/_internal/server/routers/events.py b/src/dstack/_internal/server/routers/events.py index be75cccbb4..4250eb4d7a 100644 --- a/src/dstack/_internal/server/routers/events.py +++ b/src/dstack/_internal/server/routers/events.py @@ -45,6 +45,7 @@ async def list_events( target_runs=body.target_runs, target_jobs=body.target_jobs, target_volumes=body.target_volumes, + target_gateways=body.target_gateways, within_projects=body.within_projects, within_fleets=body.within_fleets, within_runs=body.within_runs, diff --git a/src/dstack/_internal/server/schemas/events.py b/src/dstack/_internal/server/schemas/events.py index 66ea2e3404..30f7fe3244 100644 --- a/src/dstack/_internal/server/schemas/events.py +++ b/src/dstack/_internal/server/schemas/events.py @@ -91,6 +91,17 @@ class ListEventsRequest(CoreModel): max_items=MAX_FILTER_ITEMS, ), ] = None + target_gateways: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of gateway IDs." 
+ " The response will only include events that target the specified gateways" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None within_projects: Annotated[ Optional[list[uuid.UUID]], Field( diff --git a/src/dstack/_internal/server/services/events.py b/src/dstack/_internal/server/services/events.py index 80d81734b5..c6d35a4577 100644 --- a/src/dstack/_internal/server/services/events.py +++ b/src/dstack/_internal/server/services/events.py @@ -14,6 +14,7 @@ EventModel, EventTargetModel, FleetModel, + GatewayModel, InstanceModel, JobModel, MemberModel, @@ -87,6 +88,7 @@ def __post_init__(self): def from_model( model: Union[ FleetModel, + GatewayModel, InstanceModel, JobModel, ProjectModel, @@ -102,6 +104,13 @@ def from_model( id=model.id, name=model.name, ) + if isinstance(model, GatewayModel): + return Target( + type=EventTargetType.GATEWAY, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.name, + ) if isinstance(model, InstanceModel): return Target( type=EventTargetType.INSTANCE, @@ -222,6 +231,7 @@ async def list_events( target_runs: Optional[list[uuid.UUID]], target_jobs: Optional[list[uuid.UUID]], target_volumes: Optional[list[uuid.UUID]], + target_gateways: Optional[list[uuid.UUID]], within_projects: Optional[list[uuid.UUID]], within_fleets: Optional[list[uuid.UUID]], within_runs: Optional[list[uuid.UUID]], @@ -298,6 +308,13 @@ async def list_events( EventTargetModel.entity_id.in_(target_volumes), ) ) + if target_gateways is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.GATEWAY, + EventTargetModel.entity_id.in_(target_gateways), + ) + ) if within_projects is not None: target_filters.append(EventTargetModel.entity_project_id.in_(within_projects)) if within_fleets is not None: diff --git a/src/dstack/_internal/server/services/gateways/__init__.py b/src/dstack/_internal/server/services/gateways/__init__.py index 4ab80a8331..cf41b53973 100644 --- 
a/src/dstack/_internal/server/services/gateways/__init__.py +++ b/src/dstack/_internal/server/services/gateways/__init__.py @@ -558,6 +558,7 @@ def gateway_model_to_gateway(gateway_model: GatewayModel) -> Gateway: configuration = get_gateway_configuration(gateway_model) configuration.default = gateway_model.project.default_gateway_id == gateway_model.id return Gateway( + id=gateway_model.id, name=gateway_model.name, ip_address=ip_address, instance_id=instance_id, diff --git a/src/dstack/api/server/_events.py b/src/dstack/api/server/_events.py index d9bf828394..d403fb2427 100644 --- a/src/dstack/api/server/_events.py +++ b/src/dstack/api/server/_events.py @@ -30,6 +30,7 @@ def list( *, # NOTE: New parameters go here. Avoid positional parameters, they can break compatibility. target_volumes: Optional[list[UUID]] = None, + target_gateways: Optional[list[UUID]] = None, ) -> list[Event]: if prev_recorded_at is not None: # Time zones other than UTC are misinterpreted by the server: @@ -43,6 +44,7 @@ def list( target_runs=target_runs, target_jobs=target_jobs, target_volumes=target_volumes, + target_gateways=target_gateways, within_projects=within_projects, within_fleets=within_fleets, within_runs=within_runs, diff --git a/src/tests/_internal/server/routers/test_gateways.py b/src/tests/_internal/server/routers/test_gateways.py index b909c7d729..70f6b22b7e 100644 --- a/src/tests/_internal/server/routers/test_gateways.py +++ b/src/tests/_internal/server/routers/test_gateways.py @@ -17,6 +17,7 @@ create_user, get_auth_headers, ) +from dstack._internal.server.testing.matchers import SomeUUID4Str class TestListAndGetGateways: @@ -54,6 +55,7 @@ async def test_list(self, test_db, session: AsyncSession, client: AsyncClient): assert response.status_code == 200 assert response.json() == [ { + "id": SomeUUID4Str(), "backend": backend.type.value, "created_at": response.json()[0]["created_at"], "default": False, @@ -107,6 +109,7 @@ async def test_get(self, test_db, session: 
AsyncSession, client: AsyncClient): ) assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), "backend": backend.type.value, "created_at": response.json()["created_at"], "default": False, @@ -189,6 +192,7 @@ async def test_create_gateway(self, test_db, session: AsyncSession, client: Asyn ) assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), "name": "test", "backend": "aws", "region": "us", @@ -243,6 +247,7 @@ async def test_create_gateway_without_name( g.assert_called_once() assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), "name": "random-name", "backend": "aws", "region": "us", @@ -347,6 +352,7 @@ async def test_set_default_gateway(self, test_db, session: AsyncSession, client: ) assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), "backend": backend.type.value, "created_at": response.json()["created_at"], "default": True, @@ -471,6 +477,7 @@ def get_backend(project, backend_type): assert response.status_code == 200 assert response.json() == [ { + "id": str(gateway_gcp.id), "backend": backend_gcp.type.value, "created_at": response.json()[0]["created_at"], "default": False, @@ -542,6 +549,7 @@ async def test_set_wildcard_domain(self, test_db, session: AsyncSession, client: ) assert response.status_code == 200 assert response.json() == { + "id": SomeUUID4Str(), "backend": backend.type.value, "created_at": response.json()["created_at"], "status": "submitted", From 5efca70ef47c10ac19b5c7ebac6fa5dfdd13ba38 Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Tue, 27 Jan 2026 14:08:26 +0545 Subject: [PATCH 083/187] Use numeric replica-group names (#3502) Co-authored-by: Bihan Rana --- src/dstack/_internal/core/models/configurations.py | 12 ++++++------ src/dstack/_internal/core/services/__init__.py | 4 ++++ src/tests/_internal/server/routers/test_runs.py | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git 
a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 6bd9f0827e..3b2c7812b9 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -31,7 +31,7 @@ from dstack._internal.core.models.services import AnyModel, OpenAIChatModel from dstack._internal.core.models.unix import UnixUser from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point -from dstack._internal.core.services import is_valid_dstack_resource_name +from dstack._internal.core.services import is_valid_replica_group_name from dstack._internal.utils.common import has_duplicates, list_enum_values_for_annotation from dstack._internal.utils.json_schema import add_extra_schema_types from dstack._internal.utils.json_utils import ( @@ -55,7 +55,7 @@ DEFAULT_PROBE_READY_AFTER = 1 DEFAULT_PROBE_METHOD = "get" MAX_PROBE_URL_LEN = 2048 -DEFAULT_REPLICA_GROUP_NAME = "default" +DEFAULT_REPLICA_GROUP_NAME = "0" class RunConfigurationType(str, Enum): @@ -756,7 +756,7 @@ class ReplicaGroup(CoreModel): name: Annotated[ Optional[str], Field( - description="The name of the replica group. If not provided, defaults to 'replica-group-0', 'replica-group-1', etc. based on position." + description="The name of the replica group. If not provided, defaults to '0', '1', etc. based on position." 
), ] count: Annotated[ @@ -784,8 +784,8 @@ class ReplicaGroup(CoreModel): @validator("name") def validate_name(cls, v: Optional[str]) -> Optional[str]: if v is not None: - if not is_valid_dstack_resource_name(v): - raise ValueError("Resource name should match regex '^[a-z][a-z0-9-]{1,40}$'") + if not is_valid_replica_group_name(v): + raise ValueError("Resource name should match regex '^[a-z0-9][a-z0-9-]{0,39}$'") return v @validator("count") @@ -920,7 +920,7 @@ def validate_replicas( # Assign default names to groups without names for index, group in enumerate(v): if group.name is None: - group.name = f"replica-group-{index}" + group.name = str(index) # Check for duplicate names names = [group.name for group in v] diff --git a/src/dstack/_internal/core/services/__init__.py b/src/dstack/_internal/core/services/__init__.py index 89ec6d8520..6d698e1228 100644 --- a/src/dstack/_internal/core/services/__init__.py +++ b/src/dstack/_internal/core/services/__init__.py @@ -10,3 +10,7 @@ def validate_dstack_resource_name(resource_name: str): def is_valid_dstack_resource_name(resource_name: str) -> bool: return re.match("^[a-z][a-z0-9-]{1,40}$", resource_name) is not None + + +def is_valid_replica_group_name(name: str) -> bool: + return re.match("^[a-z0-9][a-z0-9-]{0,39}$", name) is not None diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 9485ed1abb..70ab54bd16 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -257,7 +257,7 @@ def get_dev_env_run_plan_dict( "replica_num": 0, "job_num": 0, "jobs_per_replica": 1, - "replica_group": "default", + "replica_group": "0", "single_branch": False, "max_duration": None, "stop_duration": 300, @@ -494,7 +494,7 @@ def get_dev_env_run_dict( "replica_num": 0, "job_num": 0, "jobs_per_replica": 1, - "replica_group": "default", + "replica_group": "0", "single_branch": False, "max_duration": None, 
"stop_duration": 300, From b4c6f17839ed3e46d41e4af7470da4f6d32d60a1 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Tue, 27 Jan 2026 08:44:37 +0000 Subject: [PATCH 084/187] Add gateway lifecycle events (#3500) - Gateway created - Gateway status changed - Gateway deleted - Gateway set as default - Gateway unset as default - Gateway wildcard domain changed --- .../background/tasks/process_gateways.py | 24 +-- .../_internal/server/routers/gateways.py | 15 +- .../server/services/gateways/__init__.py | 141 ++++++++++++++---- src/dstack/_internal/server/testing/common.py | 15 +- .../background/tasks/test_process_gateways.py | 16 ++ .../_internal/server/routers/test_gateways.py | 57 ++++++- 6 files changed, 211 insertions(+), 57 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_gateways.py b/src/dstack/_internal/server/background/tasks/process_gateways.py index a54cb9e319..2566a4f4d8 100644 --- a/src/dstack/_internal/server/background/tasks/process_gateways.py +++ b/src/dstack/_internal/server/background/tasks/process_gateways.py @@ -14,6 +14,7 @@ GatewayConnection, create_gateway_compute, gateway_connections_pool, + switch_gateway_status, ) from dstack._internal.server.services.locking import advisory_lock_ctx, get_locker from dstack._internal.server.services.logging import fmt @@ -60,14 +61,6 @@ async def process_gateways(): logger.error( "%s: unexpected gateway status %r", fmt(gateway_model), initial_status.upper() ) - if gateway_model.status != initial_status: - logger.info( - "%s: gateway status has changed %s -> %s%s", - fmt(gateway_model), - initial_status.upper(), - gateway_model.status.upper(), - f": {gateway_model.status_message}" if gateway_model.status_message else "", - ) gateway_model.last_processed_at = get_current_datetime() await session.commit() finally: @@ -128,8 +121,8 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew project=gateway_model.project, 
backend_type=configuration.backend ) except BackendNotAvailable: - gateway_model.status = GatewayStatus.FAILED gateway_model.status_message = "Backend not available" + switch_gateway_status(session, gateway_model, GatewayStatus.FAILED) return try: @@ -140,18 +133,17 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew backend_id=backend_model.id, ) session.add(gateway_model) - gateway_model.status = GatewayStatus.PROVISIONING + switch_gateway_status(session, gateway_model, GatewayStatus.PROVISIONING) except BackendError as e: - logger.info("%s: failed to create gateway compute: %r", fmt(gateway_model), e) - gateway_model.status = GatewayStatus.FAILED status_message = f"Backend error: {repr(e)}" if len(e.args) > 0: status_message = str(e.args[0]) gateway_model.status_message = status_message + switch_gateway_status(session, gateway_model, GatewayStatus.FAILED) except Exception as e: logger.exception("%s: got exception when creating gateway compute", fmt(gateway_model)) - gateway_model.status = GatewayStatus.FAILED gateway_model.status_message = f"Unexpected error: {repr(e)}" + switch_gateway_status(session, gateway_model, GatewayStatus.FAILED) async def _process_provisioning_gateway( @@ -179,18 +171,18 @@ async def _process_provisioning_gateway( gateway_model.gateway_compute ) if connection is None: - gateway_model.status = GatewayStatus.FAILED gateway_model.status_message = "Failed to connect to gateway" + switch_gateway_status(session, gateway_model, GatewayStatus.FAILED) gateway_model.gateway_compute.deleted = True return try: await gateways_services.configure_gateway(connection) except Exception: logger.exception("%s: failed to configure gateway", fmt(gateway_model)) - gateway_model.status = GatewayStatus.FAILED gateway_model.status_message = "Failed to configure gateway" + switch_gateway_status(session, gateway_model, GatewayStatus.FAILED) await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address) 
gateway_model.gateway_compute.active = False return - gateway_model.status = GatewayStatus.RUNNING + switch_gateway_status(session, gateway_model, GatewayStatus.RUNNING) diff --git a/src/dstack/_internal/server/routers/gateways.py b/src/dstack/_internal/server/routers/gateways.py index fb03a3d69c..0f89e5db45 100644 --- a/src/dstack/_internal/server/routers/gateways.py +++ b/src/dstack/_internal/server/routers/gateways.py @@ -72,11 +72,12 @@ async def delete_gateways( session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), ): - _, project = user_project + user, project = user_project await gateways.delete_gateways( session=session, project=project, gateways_names=body.names, + user=user, ) @@ -86,8 +87,8 @@ async def set_default_gateway( session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), ): - _, project = user_project - await gateways.set_default_gateway(session=session, project=project, name=body.name) + user, project = user_project + await gateways.set_default_gateway(session=session, project=project, name=body.name, user=user) @router.post("/set_wildcard_domain", response_model=models.Gateway) @@ -96,9 +97,13 @@ async def set_gateway_wildcard_domain( session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), ): - _, project = user_project + user, project = user_project return CustomORJSONResponse( await gateways.set_gateway_wildcard_domain( - session=session, project=project, name=body.name, wildcard_domain=body.wildcard_domain + session=session, + project=project, + name=body.name, + wildcard_domain=body.wildcard_domain, + user=user, ) ) diff --git a/src/dstack/_internal/server/services/gateways/__init__.py b/src/dstack/_internal/server/services/gateways/__init__.py index cf41b53973..bff20466a8 100644 --- a/src/dstack/_internal/server/services/gateways/__init__.py +++ 
b/src/dstack/_internal/server/services/gateways/__init__.py @@ -1,6 +1,8 @@ import asyncio import datetime import uuid +from collections.abc import AsyncGenerator +from contextlib import asynccontextmanager from datetime import timedelta from functools import partial from typing import List, Optional, Sequence @@ -45,6 +47,7 @@ ProjectModel, UserModel, ) +from dstack._internal.server.services import events from dstack._internal.server.services.backends import ( check_backend_type_available, get_project_backend_by_type_or_error, @@ -66,6 +69,24 @@ logger = get_logger(__name__) +def switch_gateway_status( + session: AsyncSession, + gateway_model: GatewayModel, + new_status: GatewayStatus, + actor: events.AnyActor = events.SystemActor(), +): + old_status = gateway_model.status + if old_status == new_status: + return + + gateway_model.status = new_status + + msg = f"Gateway status changed {old_status.upper()} -> {new_status.upper()}" + if gateway_model.status_message is not None: + msg += f" ({gateway_model.status_message})" + events.emit(session, msg, actor=actor, targets=[events.Target.from_model(gateway_model)]) + + GATEWAY_CONNECT_ATTEMPTS = 30 GATEWAY_CONNECT_DELAY = 10 GATEWAY_CONFIGURE_ATTEMPTS = 50 @@ -163,6 +184,7 @@ async def create_gateway( configuration.name = await generate_gateway_name(session=session, project=project) gateway = GatewayModel( + id=uuid.uuid4(), name=configuration.name, region=configuration.region, project_id=project.id, @@ -173,11 +195,19 @@ async def create_gateway( last_processed_at=get_current_datetime(), ) session.add(gateway) + events.emit( + session, + f"Gateway created. 
Status: {gateway.status.upper()}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(gateway)], + ) await session.commit() default_gateway = await get_project_default_gateway_model(session=session, project=project) if default_gateway is None or configuration.default: - await set_default_gateway(session=session, project=project, name=configuration.name) + await set_default_gateway( + session=session, project=project, name=configuration.name, user=user + ) return gateway_model_to_gateway(gateway) @@ -214,6 +244,7 @@ async def delete_gateways( session: AsyncSession, project: ProjectModel, gateways_names: List[str], + user: UserModel, ): res = await session.execute( select(GatewayModel).where( @@ -273,46 +304,51 @@ async def delete_gateways( gateway_model.gateway_compute.deleted = True session.add(gateway_model.gateway_compute) await session.delete(gateway_model) + events.emit( + session, + "Gateway deleted", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(gateway_model)], + ) await session.commit() async def set_gateway_wildcard_domain( - session: AsyncSession, project: ProjectModel, name: str, wildcard_domain: Optional[str] + session: AsyncSession, + project: ProjectModel, + name: str, + wildcard_domain: Optional[str], + user: UserModel, ) -> Gateway: - gateway = await get_project_gateway_model_by_name( - session=session, - project=project, - name=name, - ) - if gateway is None: - raise ResourceNotExistsError() - if gateway.backend.type == BackendType.DSTACK: - raise ServerClientError("Custom domains for dstack Sky gateway are not supported") - await session.execute( - update(GatewayModel) - .where( - GatewayModel.project_id == project.id, - GatewayModel.name == name, - ) - .values( - wildcard_domain=wildcard_domain, - ) - ) - await session.commit() - gateway = await get_project_gateway_model_by_name( - session=session, - project=project, - name=name, - ) - if gateway is None: - raise ResourceNotExistsError() 
+ async with get_project_gateway_model_by_name_for_update( + session=session, project=project, name=name + ) as gateway: + if gateway is None: + raise ResourceNotExistsError() + if gateway.backend.type == BackendType.DSTACK: + raise ServerClientError("Custom domains for dstack Sky gateway are not supported") + old_domain = gateway.wildcard_domain + if old_domain != wildcard_domain: + gateway.wildcard_domain = wildcard_domain + events.emit( + session, + f"Gateway wildcard domain changed {old_domain!r} -> {gateway.wildcard_domain!r}", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(gateway)], + ) + await session.commit() return gateway_model_to_gateway(gateway) -async def set_default_gateway(session: AsyncSession, project: ProjectModel, name: str): +async def set_default_gateway( + session: AsyncSession, project: ProjectModel, name: str, user: Optional[UserModel] +): gateway = await get_project_gateway_model_by_name(session=session, project=project, name=name) if gateway is None: raise ResourceNotExistsError() + if project.default_gateway_id == gateway.id: + return + previous_gateway = await get_project_default_gateway_model(session, project) await session.execute( update(ProjectModel) .where( @@ -322,6 +358,19 @@ async def set_default_gateway(session: AsyncSession, project: ProjectModel, name default_gateway_id=gateway.id, ) ) + if previous_gateway is not None: + events.emit( + session, + "Gateway unset as default", + actor=events.UserActor.from_user(user) if user is not None else events.SystemActor(), + targets=[events.Target.from_model(previous_gateway)], + ) + events.emit( + session, + "Gateway set as default", + actor=events.UserActor.from_user(user) if user is not None else events.SystemActor(), + targets=[events.Target.from_model(gateway)], + ) await session.commit() @@ -343,6 +392,38 @@ async def get_project_gateway_model_by_name( return res.scalar() +@asynccontextmanager +async def get_project_gateway_model_by_name_for_update( + 
session: AsyncSession, project: ProjectModel, name: str +) -> AsyncGenerator[Optional[GatewayModel], None]: + """ + Fetch the gateway from the database and lock it for update. + + **NOTE**: commit changes to the database before exiting from this context manager, + so that in-memory locks are only released after commit. + """ + + filters = [ + GatewayModel.project_id == project.id, + GatewayModel.name == name, + ] + res = await session.execute(select(GatewayModel.id).where(*filters)) + gateway_id = res.scalar_one_or_none() + if gateway_id is None: + yield None + else: + async with get_locker(get_db().dialect_name).lock_ctx( + GatewayModel.__tablename__, [gateway_id] + ): + # Refetch after lock + res = await session.execute( + select(GatewayModel) + .where(GatewayModel.id.in_([gateway_id]), *filters) + .with_for_update(key_share=True, of=GatewayModel) + ) + yield res.scalar_one_or_none() + + async def get_project_default_gateway_model( session: AsyncSession, project: ProjectModel ) -> Optional[GatewayModel]: diff --git a/src/dstack/_internal/server/testing/common.py b/src/dstack/_internal/server/testing/common.py index 640a3932dd..cca5212576 100644 --- a/src/dstack/_internal/server/testing/common.py +++ b/src/dstack/_internal/server/testing/common.py @@ -7,8 +7,9 @@ from uuid import UUID import gpuhunt -from sqlalchemy import select +from sqlalchemy import delete, select from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload from dstack._internal.core.backends.base.compute import ( Compute, @@ -1114,8 +1115,16 @@ async def create_secret( async def list_events(session: AsyncSession) -> list[EventModel]: - res = await session.execute(select(EventModel).order_by(EventModel.recorded_at, EventModel.id)) - return list(res.scalars().all()) + res = await session.execute( + select(EventModel) + .order_by(EventModel.recorded_at, EventModel.id) + .options(joinedload(EventModel.targets)) + ) + return list(res.scalars().unique().all()) + + +async 
def clear_events(session: AsyncSession) -> None: + await session.execute(delete(EventModel)) def get_private_key_string() -> str: diff --git a/src/tests/_internal/server/background/tasks/test_process_gateways.py b/src/tests/_internal/server/background/tasks/test_process_gateways.py index 3460f18cb9..b280b8948d 100644 --- a/src/tests/_internal/server/background/tasks/test_process_gateways.py +++ b/src/tests/_internal/server/background/tasks/test_process_gateways.py @@ -13,6 +13,7 @@ create_gateway, create_gateway_compute, create_project, + list_events, ) @@ -46,6 +47,9 @@ async def test_submitted_to_provisioning(self, test_db, session: AsyncSession): assert gateway.status == GatewayStatus.PROVISIONING assert gateway.gateway_compute is not None assert gateway.gateway_compute.ip_address == "2.2.2.2" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed SUBMITTED -> PROVISIONING" async def test_marks_gateway_as_failed_if_gateway_creation_errors( self, test_db, session: AsyncSession @@ -71,6 +75,9 @@ async def test_marks_gateway_as_failed_if_gateway_creation_errors( await session.refresh(gateway) assert gateway.status == GatewayStatus.FAILED assert gateway.status_message == "Some error" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed SUBMITTED -> FAILED (Some error)" @pytest.mark.asyncio @@ -96,6 +103,9 @@ async def test_provisioning_to_running(self, test_db, session: AsyncSession): pool_add.assert_called_once() await session.refresh(gateway) assert gateway.status == GatewayStatus.RUNNING + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed PROVISIONING -> RUNNING" async def test_marks_gateway_as_failed_if_fails_to_connect( self, test_db, session: AsyncSession @@ -119,3 +129,9 @@ async def test_marks_gateway_as_failed_if_fails_to_connect( await session.refresh(gateway) 
assert gateway.status == GatewayStatus.FAILED assert gateway.status_message == "Failed to connect to gateway" + events = await list_events(session) + assert len(events) == 1 + assert ( + events[0].message + == "Gateway status changed PROVISIONING -> FAILED (Failed to connect to gateway)" + ) diff --git a/src/tests/_internal/server/routers/test_gateways.py b/src/tests/_internal/server/routers/test_gateways.py index 70f6b22b7e..f80537a1b1 100644 --- a/src/tests/_internal/server/routers/test_gateways.py +++ b/src/tests/_internal/server/routers/test_gateways.py @@ -10,12 +10,14 @@ from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( ComputeMockSpec, + clear_events, create_backend, create_gateway, create_gateway_compute, create_project, create_user, get_auth_headers, + list_events, ) from dstack._internal.server.testing.matchers import SomeUUID4Str @@ -218,6 +220,8 @@ async def test_create_gateway(self, test_db, session: AsyncSession, client: Asyn "tags": None, }, } + events = await list_events(session) + assert events[0].message == "Gateway created. Status: SUBMITTED" @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -273,6 +277,8 @@ async def test_create_gateway_without_name( "tags": None, }, } + events = await list_events(session) + assert events[0].message == "Gateway created. 
Status: SUBMITTED" @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -337,6 +343,7 @@ async def test_set_default_gateway(self, test_db, session: AsyncSession, client: project_id=project.id, backend_id=backend.id, gateway_compute_id=gateway_compute.id, + name="first_gateway", ) response = await client.post( f"/api/project/{project.name}/gateways/set_default", @@ -378,6 +385,40 @@ async def test_set_default_gateway(self, test_db, session: AsyncSession, client: "tags": None, }, } + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway set as default" + + second_gateway_compute = await create_gateway_compute( + session=session, + backend_id=backend.id, + ) + second_gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + gateway_compute_id=second_gateway_compute.id, + name="second_gateway", + ) + await clear_events(session) + response = await client.post( + f"/api/project/{project.name}/gateways/set_default", + json={"name": second_gateway.name}, + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + events = await list_events(session) + assert len(events) == 2 + actual_events = [(e.targets[0].entity_name, e.message) for e in events] + expected_events = [ + ("first_gateway", "Gateway unset as default"), + ("second_gateway", "Gateway set as default"), + ] + assert ( + actual_events == expected_events + # in case events are emitted exactly at the same time + or actual_events == expected_events[::-1] + ) @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -504,6 +545,10 @@ def get_backend(project, backend_type): }, } ] + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway deleted" + assert events[0].targets[0].entity_name == "gateway-aws" class TestUpdateGateway: @@ -541,10 +586,11 @@ async def 
test_set_wildcard_domain(self, test_db, session: AsyncSession, client: project_id=project.id, backend_id=backend.id, gateway_compute_id=gateway_compute.id, + wildcard_domain="old.example", ) response = await client.post( f"/api/project/{project.name}/gateways/set_wildcard_domain", - json={"name": gateway.name, "wildcard_domain": "test.com"}, + json={"name": gateway.name, "wildcard_domain": "new.example"}, headers=get_auth_headers(user.token), ) assert response.status_code == 200 @@ -560,7 +606,7 @@ async def test_set_wildcard_domain(self, test_db, session: AsyncSession, client: "hostname": gateway_compute.ip_address, "name": gateway.name, "region": gateway.region, - "wildcard_domain": "test.com", + "wildcard_domain": "new.example", "configuration": { "type": "gateway", "name": gateway.name, @@ -568,13 +614,18 @@ async def test_set_wildcard_domain(self, test_db, session: AsyncSession, client: "region": gateway.region, "instance_type": None, "router": None, - "domain": "test.com", + "domain": "new.example", "default": False, "public_ip": True, "certificate": {"type": "lets-encrypt"}, "tags": None, }, } + events = await list_events(session) + assert len(events) == 1 + assert ( + events[0].message == "Gateway wildcard domain changed 'old.example' -> 'new.example'" + ) @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) From dde4a07055f77470048a61a195281d5583a18872 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 27 Jan 2026 13:56:37 +0100 Subject: [PATCH 085/187] Docs minor improvements (#3501) * [Docs] Improved documentation structure (WIP) - [x] Introduced `More` under `Concepts` - [x] Moved `Metrics` to `Concepts` - [x] Improved `Installation` (removed SSH fleets - only keep it in `Backends`; moved `Configure` after `Set up the server`) - [x] Mention `server restart is required after updating server/config.yml` in `Backends` - [x] Improved `Distributed tasks` 
(structure; links to `Fleets` and `Examples`) * [Docs] Documentation improvements - [x] Improved `Fleets` documentation - [x] Minor improvements of the `Tasks` page under `Concepts` - [x] Minor improvements on the home page * [Docs] Minor updates to `README.md`, `Overview`, `Fleets`, `Quickstart`, and examples --- README.md | 13 +- docs/blog/posts/gpu-health-checks.md | 4 +- docs/blog/posts/metrics-ui.md | 2 +- docs/blog/posts/prometheus.md | 4 +- docs/docs/concepts/backends.md | 21 +- docs/docs/concepts/fleets.md | 511 ++++++++---------- docs/docs/{guides => concepts}/metrics.md | 0 docs/docs/concepts/tasks.md | 23 +- docs/docs/guides/protips.md | 2 +- docs/docs/guides/troubleshooting.md | 2 +- docs/docs/index.md | 8 +- docs/docs/installation/index.md | 31 +- docs/docs/quickstart.md | 10 +- docs/overrides/home.html | 43 +- examples/clusters/nccl-rccl-tests/README.md | 2 +- .../distributed-training/axolotl/README.md | 2 +- .../distributed-training/ray-ragen/README.md | 2 +- examples/distributed-training/trl/README.md | 2 +- mkdocs.yml | 25 +- 19 files changed, 325 insertions(+), 382 deletions(-) rename docs/docs/{guides => concepts}/metrics.md (100%) diff --git a/README.md b/README.md index 71d8d1a8b7..bbba8e136a 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks. -#### Hardware +#### Accelerators `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box. @@ -46,7 +46,7 @@ It streamlines development, training, and inference, and is compatible with any ##### Configure backends -To orchestrate compute across cloud providers or existing Kubernetes clusters, you need to configure backends. +To orchestrate compute across GPU clouds or Kubernetes clusters, you need to configure backends. 
Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](https://dstack.ai/docs/concepts/projects#backends) in the UI. @@ -123,12 +123,11 @@ Configuration is updated at ~/.dstack/config.yml `dstack` supports the following configurations: -* [Dev environments](https://dstack.ai/docs/dev-environments) — for interactive development using a desktop IDE -* [Tasks](https://dstack.ai/docs/tasks) — for scheduling jobs (incl. distributed jobs) or running web apps -* [Services](https://dstack.ai/docs/services) — for deployment of models and web apps (with auto-scaling and authorization) -* [Fleets](https://dstack.ai/docs/fleets) — for managing cloud and on-prem clusters +* [Fleets](https://dstack.ai/docs/concepts/fleets) — for managing cloud and on-prem clusters +* [Dev environments](https://dstack.ai/docs/concepts/dev-environments) — for interactive development using a desktop IDE +* [Tasks](https://dstack.ai/docs/concepts/tasks) — for scheduling jobs (incl. distributed jobs) or running web apps +* [Services](https://dstack.ai/docs/concepts/services) — for deployment of models and web apps (with auto-scaling and authorization) * [Volumes](https://dstack.ai/docs/concepts/volumes) — for managing persisted volumes -* [Gateways](https://dstack.ai/docs/concepts/gateways) — for configuring the ingress traffic and public endpoints Configuration can be defined as YAML files within your repo. diff --git a/docs/blog/posts/gpu-health-checks.md b/docs/blog/posts/gpu-health-checks.md index c10557e753..b864e77855 100644 --- a/docs/blog/posts/gpu-health-checks.md +++ b/docs/blog/posts/gpu-health-checks.md @@ -12,7 +12,7 @@ categories: In large-scale training, a single bad GPU can derail progress. Sometimes the failure is obvious — jobs crash outright. Other times it’s subtle: correctable memory errors, intermittent instability, or thermal throttling that quietly drags down throughput. 
In big experiments, these issues can go unnoticed for hours or days, wasting compute and delaying results. -`dstack` already supports GPU telemetry monitoring through NVIDIA DCGM [metrics](../../docs/guides/metrics.md), covering utilization, memory, and temperature. This release extends that capability with passive hardware health checks powered by DCGM [background health checks](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#background-health-checks). With these, `dstack` continuously evaluates fleet GPUs for hardware reliability and displays their status before scheduling workloads. +`dstack` already supports GPU telemetry monitoring through NVIDIA DCGM [metrics](../../docs/concepts/metrics.md), covering utilization, memory, and temperature. This release extends that capability with passive hardware health checks powered by DCGM [background health checks](https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/feature-overview.html#background-health-checks). With these, `dstack` continuously evaluates fleet GPUs for hardware reliability and displays their status before scheduling workloads. @@ -69,5 +69,5 @@ If you have experience with GPU reliability or ideas for automated recovery, joi !!! info "What's next?" 1. Check [Quickstart](../../docs/quickstart.md) 2. Explore the [clusters](../../docs/guides/clusters.md) guide - 3. Learn more about [metrics](../../docs/guides/metrics.md) + 3. Learn more about [metrics](../../docs/concepts/metrics.md) 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/metrics-ui.md b/docs/blog/posts/metrics-ui.md index db21cf019a..877ae9fca8 100644 --- a/docs/blog/posts/metrics-ui.md +++ b/docs/blog/posts/metrics-ui.md @@ -53,6 +53,6 @@ For persistent storage and long-term access to metrics, we still recommend setti metrics from `dstack`. !!! info "What's next?" - 1. See [Metrics](../../docs/guides/metrics.md) + 1. See [Metrics](../../docs/concepts/metrics.md) 2. 
Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) 3. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/prometheus.md b/docs/blog/posts/prometheus.md index 8a4d579c04..08aecb4cf5 100644 --- a/docs/blog/posts/prometheus.md +++ b/docs/blog/posts/prometheus.md @@ -45,7 +45,7 @@ Overall, `dstack` collects three groups of metrics: | **Runs** | Run metrics include run counters for each user in each project. | | **Jobs** | A run consists of one or more jobs, each mapped to a container. Job metrics offer insights into execution time, cost, GPU model, NVIDIA DCGM telemetry, and more. | -For a full list of available metrics and labels, check out [Metrics](../../docs/guides/metrics.md). +For a full list of available metrics and labels, check out [Metrics](../../docs/concepts/metrics.md). ??? info "NVIDIA" NVIDIA DCGM metrics are automatically collected for `aws`, `azure`, `gcp`, and `oci` backends, @@ -59,7 +59,7 @@ For a full list of available metrics and labels, check out [Metrics](../../docs/ only accessible through the UI and the [`dstack metrics`](dstack-metrics.md) CLI. !!! info "What's next?" - 1. See [Metrics](../../docs/guides/metrics.md) + 1. See [Metrics](../../docs/concepts/metrics.md) 1. Check [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 9a1c90ec5c..572d4e0411 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -1,21 +1,22 @@ # Backends -Backends allow `dstack` to provision fleets across cloud providers or Kubernetes clusters. +Backends allow `dstack` to provision fleets across GPU clouds or Kubernetes clusters. 
`dstack` supports two types of backends: * [VM-based](#vm-based) – use `dstack`'s native integration with cloud providers to provision VMs, manage clusters, and orchestrate container-based runs. * [Container-based](#container-based) – use either `dstack`'s native integration with cloud providers or Kubernetes to orchestrate container-based runs; provisioning in this case is delegated to the cloud provider or Kubernetes. -??? info "SSH fleets" +!!! info "SSH fleets" When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](../concepts/fleets.md#ssh-fleets) once the server is up. Backends can be configured via `~/.dstack/server/config.yml` or through the [project settings page](../concepts/projects.md#backends) in the UI. See the examples of backend configuration below. +> If you update `~/.dstack/server/config.yml`, you have to restart the server. + ## VM-based -VM-based backends allow `dstack` users to manage clusters and orchestrate container-based runs across a wide range of cloud providers. -Under the hood, `dstack` uses native integrations with these providers to provision clusters on demand. +VM-based backends allow `dstack` users to manage clusters and orchestrate container-based runs across a wide range of cloud providers. Under the hood, `dstack` uses native integrations with these providers to provision clusters on demand. Compared to [container-based](#container-based) backends, this approach offers finer-grained, simpler control over cluster provisioning and eliminates the dependency on a Kubernetes layer. @@ -1036,9 +1037,13 @@ projects: No additional setup is required — `dstack` configures and manages the proxy automatically. -??? info "NVIDIA GPU Operator" - For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the - [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) pre-installed. +??? 
info "Required operators" + === "NVIDIA" + For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the + [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) pre-installed. + === "AMD" + For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the + [AMD GPU Operator](https://github.com/ROCm/gpu-operator) pre-installed. 100% +
- FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED - my-fleet - - - - - - -``` + ```shell + $ dstack apply -f fleet.dstack.yml + + # BACKEND REGION RESOURCES SPOT PRICE + 1 gcp us-west4 2xCPU, 8GB, 100GB (disk) yes $0.010052 + 2 azure westeurope 2xCPU, 8GB, 100GB (disk) yes $0.0132 + 3 gcp europe-central2 2xCPU, 8GB, 100GB (disk) yes $0.013248 -
+ Create the fleet? [y/n]: y -If `nodes` is a range that starts above `0`, `dstack` pre-creates the initial number of instances up front, while any additional ones are created on demand. + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + my-fleet 0 gcp (europe-west-1) L4:24GB (spot) $0.1624 idle 3 mins ago + 1 gcp (europe-west-1) L4:24GB (spot) $0.1624 idle 3 mins ago + ``` -> Setting the `nodes` range to start above `0` is supported only for [VM-based backends](backends.md#vm-based). + -??? info "Target number of nodes" + If the `nodes` range starts with `0`, `dstack apply` creates only a template. Instances are provisioned only when you submit runs. - If `nodes` is defined as a range, you can start with more than the minimum number of instances by using the `target` parameter when creating the fleet. +=== "SSH fleets" + If you have a group of on-prem servers accessible via SSH, you can create an SSH fleet as follows:
- + ```yaml type: fleet - name: my-fleet + + # Uncomment if instances are interconnected + #placement: cluster - nodes: - min: 0 - max: 2 - - # Provision 2 instances initially - target: 2 - - # Deprovision instances above the minimum if they remain idle - idle_duration: 1h + ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 3.255.177.51 + - 3.255.177.52 ``` - +
-By default, when you submit a [dev environment](dev-environments.md), [task](tasks.md), or [service](services.md), `dstack` tries all available fleets. However, you can explicitly specify the [`fleets`](../reference/dstack.yml/dev-environment.md#fleets) in your run configuration -or via [`--fleet`](../reference/cli/dstack/apply.md#fleet) with `dstack apply`. + Pass the fleet configuration to `dstack apply`: -### Configuration options +
-#### Placement { #backend-placement } + ```shell + $ dstack apply -f fleet.dstack.yml + + Provisioning... + ---> 100% -To ensure instances are interconnected (e.g., for -[distributed tasks](tasks.md#distributed-tasks)), set `placement` to `cluster`. -This ensures all instances are provisioned with optimal inter-node connectivity. + FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED + my-fleet 0 ssh (remote) L4:24GB $0 idle 3 mins ago + 1 ssh (remote) L4:24GB $0 idle 3 mins ago + ``` -??? info "AWS" - When you create a fleet with AWS, [Elastic Fabric Adapter networking](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) is automatically configured if it’s supported for the corresponding instance type. - Note, EFA requires the `public_ips` to be set to `false` in the `aws` backend configuration. - Otherwise, instances are only connected by the default VPC subnet. +
- Refer to the [AWS](../../examples/clusters/aws/index.md) example for more details. + `dstack apply` automatically connects to on-prem servers, installs the required dependencies, and adds them to the created fleet. -??? info "GCP" - When you create a fleet with GCP, `dstack` automatically configures [GPUDirect-TCPXO and GPUDirect-TCPX](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot) networking for the A3 Mega and A3 High instance types, as well as RoCE networking for the A4 instance type. + ??? info "Host requirements" + 1. Hosts must be pre-installed with Docker. - !!! info "Backend configuration" - You may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. - Refer to the [GCP](../../examples/clusters/gcp/index.md) examples for more details. + === "NVIDIA" + 2. Hosts with NVIDIA GPUs must also be pre-installed with CUDA 12.1 and + [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). -??? info "Nebius" - When you create a fleet with Nebius, [InfiniBand networking](https://docs.nebius.com/compute/clusters/gpu) is automatically configured if it’s supported for the corresponding instance type. - Otherwise, instances are only connected by the default VPC subnet. + === "AMD" + 2. Hosts with AMD GPUs must also be pre-installed with AMDGPU-DKMS kernel driver (e.g. via + [native package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/native-install/index.html) + or [AMDGPU installer](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html).) - An InfiniBand fabric for the cluster is selected automatically. If you prefer to use some specific fabrics, configure them in the - [backend settings](../reference/server/config.yml.md#nebius). + === "Intel Gaudi" + 2. 
Hosts with Intel Gaudi accelerators must be pre-installed with [Gaudi software and drivers](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#driver-installation). + This must include the drivers, `hl-smi`, and Habana Container Runtime. -The `cluster` placement is supported for `aws`, `azure`, `gcp`, `nebius`, `oci`, and `vultr` -backends. + === "Tenstorrent" + 2. Hosts with Tenstorrent accelerators must be pre-installed with [Tenstorrent software](https://docs.tenstorrent.com/getting-started/README.html#software-installation). + This must include the drivers, `tt-smi`, and HugePages. -> For more details on optimal inter-node connectivity, read the [Clusters](../guides/clusters.md) guide. + 3. The user specified must have passwordless `sudo` access. - + 4. The SSH server must be running and configured with `AllowTcpForwarding yes` in `/etc/ssh/sshd_config`. -#### Resources + 5. The firewall must allow SSH and should forbid any other connections from external networks. For `placement: cluster` fleets, it should also allow any communication between fleet nodes. -When you specify a resource value like `cpu` or `memory`, -you can either use an exact value (e.g. `24GB`) or a -range (e.g. `24GB..`, or `24GB..80GB`, or `..80GB`). +> Once the fleet is created, you can run [dev environments](dev-environments.md), [tasks](tasks.md), and [services](services.md). -
+## Configuration options -```yaml -type: fleet -# The name is optional, if not specified, generated randomly -name: my-fleet +Backend fleets support [many options](../reference/dstack.yml/fleet.md); see some major configuration examples below. -nodes: 2 +### Cluster placement -resources: - # 200GB or more RAM - memory: 200GB.. - # 4 GPUs from 40GB to 80GB - gpu: 40GB..80GB:4 - # Disk size - disk: 500GB -``` +Both [backend fleets](#backend-fleet) and [SSH fleets](#ssh-fleet) allow the `placement` property to be set to `cluster`. -
- -The `gpu` property allows specifying not only memory size but also GPU vendor, names -and their quantity. Examples: `nvidia` (one NVIDIA GPU), `A100` (one A100), `A10G,A100` (either A10G or A100), -`A100:80GB` (one A100 of 80GB), `A100:2` (two A100), `24GB..40GB:2` (two GPUs between 24GB and 40GB), -`A100:40GB:2` (two A100 GPUs of 40GB). +This property ensures that instances are interconnected. This is required for running [distributed tasks](tasks.md#distributed-tasks). -??? info "Google Cloud TPU" - To use TPUs, specify its architecture via the `gpu` property. +=== "Backend fleets" + Backend fleets allow to provision interconnected clusters across supported backends. +
+ ```yaml type: fleet - # The name is optional, if not specified, generated randomly name: my-fleet nodes: 2 - + placement: cluster + resources: - gpu: v2-8 + gpu: H100:8 ``` + +
- Currently, only 8 TPU cores can be specified, supporting single TPU device workloads. Multi-TPU support is coming soon. - -> If you’re unsure which offers (hardware configurations) are available from the configured backends, use the -> [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list them. - -#### Blocks { #backend-blocks } - -For backend fleets, `blocks` function the same way as in SSH fleets. -See the [`Blocks`](#ssh-blocks) section under SSH fleets for details on the blocks concept. - -
- -```yaml -type: fleet - -name: my-fleet + #### Backends -resources: - gpu: NVIDIA:80GB:8 + Fast interconnect is supported on the `aws`, `gcp`, `nebius`, `kubernetes`, and `runpod` backends. Some backends may require additional configuration. -# Split into 4 blocks, each with 2 GPUs -blocks: 4 -``` + === "AWS" + On AWS, `dstack` requires `public_ips` to be set to `false` in the backend configuration. + Refer to the [AWS](../../examples/clusters/aws/index.md) example for more details. -
+ === "GCP" + On GCP, you may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. + Refer to the [GCP](../../examples/clusters/gcp/index.md) examples for more details. -#### Idle duration + === "Nebius" + On [Nebius](https://docs.nebius.com/compute/clusters/gpu), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. -By default, fleet instances stay `idle` for 3 days and can be reused within that time. -If an instance is not reused within this period, it is automatically terminated. + === "Kubernetes" + If the Kubernetes cluster has interconnect configured, `dstack` can use it without additional setup. + See the [Lambda](../../examples/clusters/lambda/index.md#kubernetes) or [Crusoe](../../examples/clusters/crusoe/index.md#kubernetes) examples. + + === "Runpod" + On [Runpod](https://docs.runpod.io/instant-clusters), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. + + > See the [Clusters](../../examples.md#clusters) examples. -To change the default idle duration, set -[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) in the fleet configuration (e.g., `0s`, `1m`, or `off` for -unlimited). +=== "SSH fleets" + If the hosts in the SSH fleet have interconnect configured, you only need to set `placement` to `cluster`. -
- +
+ ```yaml type: fleet - # The name is optional, if not specified, generated randomly name: my-fleet - - nodes: 2 - # Terminate instances idle for more than 1 hour - idle_duration: 1h - - resources: - gpu: 24GB - ``` - -
+ placement: cluster -#### Spot policy + ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 3.255.177.51 + - 3.255.177.52 + ``` + +
-By default, `dstack` uses on-demand instances. However, you can change that -via the [`spot_policy`](../reference/dstack.yml/fleet.md#spot_policy) property. It accepts `spot`, `on-demand`, and `auto`. + !!! info "Network" + By default, `dstack` automatically detects the network shared by the hosts. However, it's possible to configure it explicitly via the [`network`](../reference/dstack.yml/fleet.md#network) property. -#### Retry policy + -By default, if `dstack` fails to provision an instance or an instance is interrupted, no retry is attempted. +### Nodes -If you'd like `dstack` to do it, configure the -[retry](../reference/dstack.yml/fleet.md#retry) property accordingly: +The `nodes` property is supported only by backend fleets and specifies how many nodes `dstack` must or can provision. -
+
```yaml type: fleet -# The name is optional, if not specified, generated randomly name: my-fleet -nodes: 1 +# Allow to provision of up to 2 instances +nodes: 0..2 -resources: - gpu: 24GB +# Uncomment to ensure instances are inter-connected +#placement: cluster + +# Deprovision instances above the minimum if they remain idle +idle_duration: 1h -retry: - # Retry on specific events - on_events: [no-capacity, interruption] - # Retry for up to 1 hour - duration: 1h +resources: + # Allow to provision up to 8 GPUs + gpu: 0..8 ```
-!!! info "Reference" - Backend fleets support many more configuration options, - incl. [`backends`](../reference/dstack.yml/fleet.md#backends), - [`regions`](../reference/dstack.yml/fleet.md#regions), - [`max_price`](../reference/dstack.yml/fleet.md#max_price), and - among [others](../reference/dstack.yml/fleet.md). +#### Pre-provisioning -## SSH fleets +If the `nodes` range starts with `0`, `dstack apply` creates only a template, and instances are provisioned when you submit runs. -If you have a group of on-prem servers accessible via SSH, you can create an SSH fleet. +To provision instances up front, set the `nodes` range to start above `0`. This pre-creates the initial number of instances; additional instances (if any) are provisioned on demand. -### Apply a configuration -Define a fleet configuration as a YAML file in your project directory. The file must have a -`.dstack.yml` extension (e.g. `.dstack.yml` or `fleet.dstack.yml`). +
+ + ```yaml + type: fleet + name: my-fleet + + nodes: 2..10 -
+ # Uncomment to ensure instances are inter-connected + #placement: cluster + resources: + gpu: H100:8 + ``` + +
+ +Pre-provisioning is supported only for [VM-based backends](backends.md#vm-based). + +??? info "Target number" + To pre-provision more than the minimum number of instances, set the `target` parameter. + +
+ ```yaml type: fleet - # The name is optional, if not specified, generated randomly name: my-fleet - # Uncomment if instances are interconnected - #placement: cluster + nodes: + min: 2 + max: 10 + target: 6 - # SSH credentials for the on-prem servers - ssh_config: - user: ubuntu - identity_file: ~/.ssh/id_rsa - hosts: - - 3.255.177.51 - - 3.255.177.52 + # Deprovision instances above the minimum if they remain idle + idle_duration: 1h ``` - -
-??? info "Requirements" - 1. Hosts must be pre-installed with Docker. +
- === "NVIDIA" - 2. Hosts with NVIDIA GPUs must also be pre-installed with CUDA 12.1 and - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). + `dstack apply` pre-provisions up to `target` and scales back to `min` after `idle_duration`. - === "AMD" - 2. Hosts with AMD GPUs must also be pre-installed with AMDGPU-DKMS kernel driver (e.g. via - [native package manager](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/native-install/index.html) - or [AMDGPU installer](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html).) +### Resources - === "Intel Gaudi" - 2. Hosts with Intel Gaudi accelerators must be pre-installed with [Gaudi software and drivers](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#driver-installation). - This must include the drivers, `hl-smi`, and Habana Container Runtime. +Backend fleets allow you to specify the resource requirements for the instances to be provisioned. The `resources` property syntax is the same as for [run configurations](dev-environments.md#resources). - === "Tenstorrent" - 2. Hosts with Tenstorrent accelerators must be pre-installed with [Tenstorrent software](https://docs.tenstorrent.com/getting-started/README.html#software-installation). - This must include the drivers, `tt-smi`, and HugePages. +> Not directly related, but in addition to `resources`, you can specify [`spot_policy`](../reference/dstack.yml/fleet.md#instance_types), [`instance_types`](../reference/dstack.yml/fleet.md#instance_types), [`max_price`](../reference/dstack.yml/fleet.md#max_price), [`region`](../reference/dstack.yml/fleet.md#max_price), and other [options](../reference/dstack.yml/fleet.md#). - 3. The user specified must have passwordless `sudo` access. + - 4. The SSH server must be running and configured with `AllowTcpForwarding yes` in `/etc/ssh/sshd_config`. +### Backends - 5. 
The firewall must allow SSH and should forbid any other connections from external networks. For `placement: cluster` fleets, it should also allow any communication between fleet nodes. +### Idle duration -To create or update the fleet, pass the fleet configuration to [`dstack apply`](../reference/cli/dstack/apply.md): +By default, instances of a backend fleet stay `idle` for 3 days and can be reused within that time. +If an instance is not reused within this period, it is automatically terminated. -
+To change the default idle duration, set +[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) in the fleet configuration (e.g., `0s`, `1m`, or `off` for +unlimited). -```shell -$ dstack apply -f examples/misc/fleets/.dstack.yml +
+ +```yaml +type: fleet +name: my-fleet -Provisioning... ----> 100% +nodes: 2 + +# Terminate instances idle for more than 1 hour +idle_duration: 1h - FLEET INSTANCE GPU PRICE STATUS CREATED - my-fleet 0 L4:24GB (spot) $0 idle 3 mins ago - 1 L4:24GB (spot) $0 idle 3 mins ago +resources: + gpu: 24GB ```
-When you apply, `dstack` connects to the specified hosts using the provided SSH credentials, -installs the dependencies, and configures these hosts as a fleet. - -Once the status of instances changes to `idle`, they can be used by dev environments, tasks, and services. - -### Configuration options +### Blocks -#### Placement { #ssh-placement } +By default, a job uses the entire instance—e.g., all 8 GPUs. To allow multiple jobs on the same instance, set the `blocks` property to divide the instance. Each job can then use one or more blocks, up to the full instance. -If the hosts are interconnected (i.e. share the same network), set `placement` to `cluster`. -This is required if you'd like to use the fleet for [distributed tasks](tasks.md#distributed-tasks). +=== "Backend fleets" +
-??? info "Network" - By default, `dstack` automatically detects the network shared by the hosts. - However, it's possible to configure it explicitly via - the [`network`](../reference/dstack.yml/fleet.md#network) property. + ```yaml + type: fleet + name: my-fleet - [//]: # (TODO: Provide an example and more detail) + nodes: 0..2 -> For more details on optimal inter-node connectivity, read the [Clusters](../guides/clusters.md) guide. + resources: + gpu: H100:8 -#### Blocks { #ssh-blocks } + # Split into 4 blocks, each with 2 GPUs + blocks: 4 + ``` -By default, a job uses the entire instance—e.g., all 8 GPUs. To allow multiple jobs on the same instance, set the `blocks` property to divide the instance. Each job can then use one or more blocks, up to the full instance. +
-
+=== "SSH fleets" +
```yaml type: fleet @@ -386,7 +356,7 @@ By default, a job uses the entire instance—e.g., all 8 GPUs. To allow multiple blocks: 1 ``` -
+
All resources (GPU, CPU, memory) are split evenly across blocks, while disk is shared. @@ -396,37 +366,16 @@ Set `blocks` to `auto` to match the number of blocks to the number of GPUs. !!! info "Distributed tasks" Distributed tasks require exclusive access to all host resources and therefore must use all blocks on each node. - -#### Environment variables - -If needed, you can specify environment variables that will be used by `dstack-shim` and passed to containers. - -[//]: # (TODO: Explain what dstack-shim is) - -For example, these variables can be used to configure a proxy: - -```yaml -type: fleet -name: my-fleet -env: - - HTTP_PROXY=http://proxy.example.com:80 - - HTTPS_PROXY=http://proxy.example.com:80 - - NO_PROXY=localhost,127.0.0.1 +### SSH config -ssh_config: - user: ubuntu - identity_file: ~/.ssh/id_rsa - hosts: - - 3.255.177.51 - - 3.255.177.52 -``` + #### Proxy jump -If fleet hosts are behind a head node (aka "login node"), configure [`proxy_jump`](../reference/dstack.yml/fleet.md#proxy_jump): +If hosts are behind a head node (aka "login node"), configure [`proxy_jump`](../reference/dstack.yml/fleet.md#proxy_jump): -
+
```yaml type: fleet @@ -446,8 +395,7 @@ If fleet hosts are behind a head node (aka "login node"), configure [`proxy_jump
-To be able to attach to runs, both explicitly with `dstack attach` and implicitly with `dstack apply`, you must either
-add a front node key (`~/.ssh/head_node_key`) to an SSH agent or configure a key path in `~/.ssh/config`:
+To be able to attach to runs, both explicitly with `dstack attach` and implicitly with `dstack apply`, you must either add a head node key (`~/.ssh/head_node_key`) to an SSH agent or configure a key path in `~/.ssh/config`:
@@ -458,22 +406,33 @@ add a front node key (`~/.ssh/head_node_key`) to an SSH agent or configure a key
-where `Host` must match `ssh_config.proxy_jump.hostname` or `ssh_config.hosts[n].proxy_jump.hostname` if you configure head nodes -on a per-worker basis. +where `Host` must match `ssh_config.proxy_jump.hostname` or `ssh_config.hosts[n].proxy_jump.hostname` if you configure head nodes on a per-worker basis. -!!! info "Reference" - For all SSH fleet configuration options, refer to the [reference](../reference/dstack.yml/fleet.md). +### Environment variables -#### Troubleshooting +If needed, you can specify environment variables that will be automatically passed to any jobs running on this fleet. + +For example, these variables can be used to configure a proxy: + +```yaml +type: fleet +name: my-fleet -!!! info "Resources" - Once the fleet is created, double-check that the GPU, memory, and disk are detected correctly. +env: + - HTTP_PROXY=http://proxy.example.com:80 + - HTTPS_PROXY=http://proxy.example.com:80 + - NO_PROXY=localhost,127.0.0.1 -If the status does not change to `idle` after a few minutes or the resources are not displayed correctly, ensure that -all host requirements are satisfied. +ssh_config: + user: ubuntu + identity_file: ~/.ssh/id_rsa + hosts: + - 3.255.177.51 + - 3.255.177.52 +``` -If the requirements are met but the fleet still fails to be created correctly, check the logs at -`/root/.dstack/shim.log` on the hosts for error details. +!!! info "Reference" + The fleet configuration file supports many more options. See the [reference](../reference/dstack.yml/fleet.md). ## Manage fleets @@ -513,4 +472,6 @@ To terminate and delete specific instances from a fleet, pass `-i INSTANCE_NUM`. !!! info "What's next?" 1. Check [dev environments](dev-environments.md), [tasks](tasks.md), and [services](services.md) - 2. Read the [Clusters](../guides/clusters.md) guide + 2. Read about [Backends](backends.md) guide + 3. Explore the [`.dstack.yml` reference](../reference/dstack.yml/fleet.md) + 4. 
See the [Clusters](../../examples.md#clusters) example diff --git a/docs/docs/guides/metrics.md b/docs/docs/concepts/metrics.md similarity index 100% rename from docs/docs/guides/metrics.md rename to docs/docs/concepts/metrics.md diff --git a/docs/docs/concepts/tasks.md b/docs/docs/concepts/tasks.md index ac94415d4d..6f3f2fabb7 100644 --- a/docs/docs/concepts/tasks.md +++ b/docs/docs/concepts/tasks.md @@ -135,18 +135,18 @@ resources:
-Nodes can communicate using their private IP addresses. -Use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other -[System environment variables](#system-environment-variables) for inter-node communication. - -`dstack` is easy to use with `accelerate`, `torchrun`, Ray, Spark, and any other distributed frameworks. +!!! info "Cluster placement" + To submit a distributed task, you must create at least one fleet with a [cluster placement](fleets.md#cluster-placement). + +Jobs on each node communicate using their private IP addresses. Use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other [system environment variables](#system-environment-variables) for inter-node communication. -!!! info "MPI" - If want to use MPI, you can set `startup_order` to `workers-first` and `stop_criteria` to `master-done`, and use `DSTACK_MPI_HOSTFILE`. - See the [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests/index.md) examples. + -> For detailed examples, see [distributed training](../../examples.md#distributed-training) examples. +`dstack` is easy to use with `accelerate`, `torchrun`, Ray, Spark, and any other distributed frameworks. + +> For detailed examples, see the [distributed training](../../examples.md#distributed-training) + and [clusters](../../examples.md#clusters) examples. ??? info "Network interface" Distributed frameworks usually detect the correct network interface automatically, @@ -172,11 +172,6 @@ Use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other For convenience, `~/.ssh/config` is preconfigured with these options, so a simple `ssh ` is enough. For a list of nodes IPs check the `DSTACK_NODES_IPS` environment variable. -!!! info "Cluster fleets" - To run distributed tasks, you need to create a fleet with [`placement: cluster`](fleets.md#cloud-placement). - -> See the [Clusters](../guides/clusters.md) guide for more details on how to use `dstack` on clusters. 
- ### Resources When you specify a resource value like `cpu` or `memory`, diff --git a/docs/docs/guides/protips.md b/docs/docs/guides/protips.md index 167b8f1b4b..dfb7abf0b6 100644 --- a/docs/docs/guides/protips.md +++ b/docs/docs/guides/protips.md @@ -482,7 +482,7 @@ The `offer` command allows you to filter and group offers with various [advanced ## Metrics -`dstack` tracks essential metrics accessible via the CLI and UI. To access advanced metrics like DCGM, configure the server to export metrics to Prometheus. See [Metrics](metrics.md) for details. +`dstack` tracks essential metrics accessible via the CLI and UI. To access advanced metrics like DCGM, configure the server to export metrics to Prometheus. See [Metrics](../concepts/metrics.md) for details. ## Service quotas diff --git a/docs/docs/guides/troubleshooting.md b/docs/docs/guides/troubleshooting.md index 5d17b894d0..2b4356fb5c 100644 --- a/docs/docs/guides/troubleshooting.md +++ b/docs/docs/guides/troubleshooting.md @@ -119,7 +119,7 @@ one of these features, `dstack` will only select offers from the backends that s [Instance volumes](../concepts/volumes.md#instance-volumes), and [Privileged containers](../reference/dstack.yml/dev-environment.md#privileged) are supported by all backends except `runpod`, `vastai`, and `kubernetes`. -- [Clusters](../concepts/fleets.md#cloud-placement) +- [Clusters](../concepts/fleets.md#cluster-placement) and [distributed tasks](../concepts/tasks.md#distributed-tasks) are only supported by the `aws`, `azure`, `gcp`, `nebius`, `oci`, and `vultr` backends, as well as SSH fleets. diff --git a/docs/docs/index.md b/docs/docs/index.md index b0228fb2c9..121a379150 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -4,9 +4,8 @@ It streamlines development, training, and inference, and is compatible with any hardware, open-source tools, and frameworks. -#### Hardware - -`dstack` supports `NVIDIA`, `AMD`, `TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box. 
+!!! info "Accelerators" + `dstack` supports `NVIDIA`, `AMD`, `TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box. ## How does it work? @@ -20,12 +19,11 @@ It streamlines development, training, and inference, and is compatible with any `dstack` supports the following configurations: +* [Fleets](concepts/fleets.md) — for managing cloud and on-prem clusters * [Dev environments](concepts/dev-environments.md) — for interactive development using a desktop IDE * [Tasks](concepts/tasks.md) — for scheduling jobs, incl. distributed ones (or running web apps) * [Services](concepts/services.md) — for deploying models (or web apps) -* [Fleets](concepts/fleets.md) — for managing cloud and on-prem clusters * [Volumes](concepts/volumes.md) — for managing network volumes (to persist data) -* [Gateways](concepts/gateways.md) — for publishing services with a custom domain and HTTPS Configuration can be defined as YAML files within your repo. diff --git a/docs/docs/installation/index.md b/docs/docs/installation/index.md index aad8741b66..e179a8e663 100644 --- a/docs/docs/installation/index.md +++ b/docs/docs/installation/index.md @@ -6,15 +6,6 @@ ## Set up the server -### Configure backends - -To orchestrate compute across cloud providers or Kubernetes clusters, you need to configure [backends](../concepts/backends.md). - -??? info "SSH fleets" - When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](../concepts/fleets.md#ssh-fleets) once the server is up. - -### Start the server - The server can run on your laptop or any environment with access to the cloud and on-prem clusters you plan to use. === "uv" @@ -72,16 +63,17 @@ The server can run on your laptop or any environment with access to the cloud an
-To verify that backends are properly configured, use the [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list available GPU offers. +For more details on server deployment options, see the [Server deployment](../guides/server-deployment.md) guide. -!!! info "Server deployment" - For more details on server deployment options, see the [Server deployment](../guides/server-deployment.md) guide. +### Configure backends + +> To orchestrate compute across GPU clouds or Kubernetes clusters, you need to configure [backends](../concepts/backends.md). ## Set up the CLI Once the server is up, you can access it via the `dstack` CLI. -> The CLI can be set up via `pip` or `uv` on Linux, macOS, and Windows. It requires Git and OpenSSH. +> The CLI can be used on Linux, macOS, and Windows. It requires Git and OpenSSH. === "uv" @@ -105,13 +97,15 @@ Once the server is up, you can access it via the `dstack` CLI. ??? info "Windows" To use the CLI on Windows, ensure you've installed Git and OpenSSH via - [Git for Windows:material-arrow-top-right-thin:{ .external }](https://git-scm.com/download/win). + [Git for Windows](https://git-scm.com/download/win). When installing it, ensure you've checked `Git from the command line and also from 3-rd party software` (or `Use Git and optional Unix tools from the Command Prompt`), and `Use bundled OpenSSH`. +### Configure the default project + To point the CLI to the `dstack` server, configure it with the server address, user token, and project name: @@ -130,6 +124,10 @@ Configuration is updated at ~/.dstack/config.yml This configuration is stored in `~/.dstack/config.yml`. +### Check offers + +To verify that both the server and CLI are properly configured, use the [`dstack offer`](../reference/cli/dstack/offer.md#list-gpu-offers) command to list available GPU offers. If you don't see valid offers, ensure you've set up [backends](../concepts/backends.md). + ??? 
info "Shell autocompletion" `dstack` supports shell autocompletion for `bash` and `zsh`. @@ -195,11 +193,10 @@ This configuration is stored in `~/.dstack/config.yml`. > If you get an error similar to `2: command not found: compdef`, then add the following line to the beginning of your `~/.zshrc` file: > `autoload -Uz compinit && compinit`. - !!! info "What's next?" - 1. Follow [Quickstart](../quickstart.md) - 2. See [Backends](../concepts/backends.md) + 1. See [Backends](../concepts/backends.md) + 2. Follow [Quickstart](../quickstart.md) 3. Check the [server deployment](../guides/server-deployment.md) guide 4. Browse [examples](../../examples.md) 5. Join the community via [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 759ec1b573..2a8d3f0610 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -1,11 +1,11 @@ # Quickstart -??? info "Prerequsites" +!!! info "Prerequsites" Before using `dstack`, ensure you've [installed](installation/index.md) the server and the CLI. ## Create a fleet -Before you can submit your first run, you have to create a [fleet](concepts/fleets.md). +> Before submitting runs, you must create a [fleet](concepts/fleets.md). === "Backend fleet" If you're using cloud providers or Kubernetes clusters and have configured the corresponding [backends](concepts/backends.md), create a fleet as follows: @@ -49,11 +49,9 @@ Before you can submit your first run, you have to create a [fleet](concepts/flee
- If `nodes` is a range that starts above `0`, `dstack` pre-creates the initial number of instances up front, while any additional ones are created on demand. - - > Setting the `nodes` range to start above `0` is supported only for [VM-based backends](concepts/backends.md#vm-based). + If the `nodes` range starts with `0`, `dstack apply` creates only a template. Instances are provisioned only when you submit runs. - If the fleet needs to be a cluster, the [placement](concepts/fleets.md#backend-placement) property must be set to `cluster`. + If the fleet needs to be a cluster, the [placement](concepts/fleets.md#cluster-placement) property must be set to `cluster`. === "SSH fleet" If you have a group of on-prem servers accessible via SSH, you can create an SSH fleet as follows: diff --git a/docs/overrides/home.html b/docs/overrides/home.html index ced53fb1e8..7cebed7b6a 100644 --- a/docs/overrides/home.html +++ b/docs/overrides/home.html @@ -190,15 +190,6 @@

Native integration with GPU clouds

fill-rule="nonzero" fill="currentColor" class="fill-main"> - - - Kubernetes - - - -

@@ -217,17 +208,17 @@

Easy to use with on-prem clusters

- - Kubernetes + + SSH fleets - - - SSH fleets + + + Kubernetes Single-node & distributed tasks Tasks - - - Clusters - -

@@ -320,7 +305,7 @@

Single-node & distributed tasks

-

Scalable service endpoints

+

Scalable model inference

With dstack, you can easily deploy any model as a secure, @@ -334,6 +319,12 @@

Scalable service endpoints

Services + + + Gateways + +

@@ -596,10 +587,12 @@

Get started in minutes

- + + Installation + +

diff --git a/examples/clusters/nccl-rccl-tests/README.md b/examples/clusters/nccl-rccl-tests/README.md index 7248a4f422..0a2a138b1a 100644 --- a/examples/clusters/nccl-rccl-tests/README.md +++ b/examples/clusters/nccl-rccl-tests/README.md @@ -3,7 +3,7 @@ This example shows how to run [NCCL](https://github.com/NVIDIA/nccl-tests) or [RCCL](https://github.com/ROCm/rccl-tests) tests on a cluster using [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#cluster-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). ## Running as a task diff --git a/examples/distributed-training/axolotl/README.md b/examples/distributed-training/axolotl/README.md index 9ddd77a363..1454732ad5 100644 --- a/examples/distributed-training/axolotl/README.md +++ b/examples/distributed-training/axolotl/README.md @@ -3,7 +3,7 @@ This example walks you through how to run distributed fine-tune using [Axolotl](https://github.com/axolotl-ai-cloud/axolotl) and [distributed tasks](https://dstack.ai/docs/concepts/tasks#distributed-tasks). !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). 
+ Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#cluster-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). ## Define a configuration diff --git a/examples/distributed-training/ray-ragen/README.md b/examples/distributed-training/ray-ragen/README.md index e79f27f788..32ce0173fd 100644 --- a/examples/distributed-training/ray-ragen/README.md +++ b/examples/distributed-training/ray-ragen/README.md @@ -6,7 +6,7 @@ to fine-tune an agent on multiple nodes. Under the hood `RAGEN` uses [verl](https://github.com/volcengine/verl) for Reinforcement Learning and [Ray](https://docs.ray.io/en/latest/) for distributed training. !!! info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#cluster-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). ## Run a Ray cluster diff --git a/examples/distributed-training/trl/README.md b/examples/distributed-training/trl/README.md index c6231e5170..9df482da52 100644 --- a/examples/distributed-training/trl/README.md +++ b/examples/distributed-training/trl/README.md @@ -3,7 +3,7 @@ This example walks you through how to run distributed fine-tune using [TRL](https://github.com/huggingface/trl), [Accelerate](https://github.com/huggingface/accelerate) and [Deepspeed](https://github.com/deepspeedai/DeepSpeed). !!! 
info "Prerequisites" - Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#backend-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). + Before running a distributed task, make sure to create a fleet with `placement` set to `cluster` (can be a [managed fleet](https://dstack.ai/docs/concepts/fleets#cluster-placement) or an [SSH fleet](https://dstack.ai/docs/concepts/fleets#ssh-placement)). ## Define a configuration diff --git a/mkdocs.yml b/mkdocs.yml index 07eed5f3b7..5dcdd69999 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -74,15 +74,13 @@ plugins: - docs/concepts/tasks.md: How to run tasks - for training or fine-tuning, including distributed tasks - docs/concepts/services.md: How to deploy services - for model inference or web apps - docs/concepts/volumes.md: How to manage volumes - for persistent storage or caching + - docs/concepts/gateways.md: How to manage gateways - enabling auto-scaling, rate limits, and custom domains - docs/concepts/secrets.md: How to manage secrets - for API keys or other sensitive data - docs/concepts/projects.md: How to manage projects - for managing separate teams - - docs/concepts/gateways.md: How to manage gateways - enabling auto-scaling, rate limits, and custom domains + - docs/concepts/metrics.md: How to manage gateways - enabling auto-scaling, rate limits, and custom domains Guides: - - docs/guides/clusters.md: How to work with clusters - for distributed tasks - - docs/guides/kubernetes.md: How to work with Kubernetes - docs/guides/server-deployment.md: Detailed guide on how to deploy the dstack server - docs/guides/troubleshooting.md: Common issues and how to troubleshoot them - - docs/guides/metrics.md: How to monitor metrics - docs/guides/protips.md: Pro tips - tips and tricks to use dstack more efficiently Examples: - examples/single-node-training/trl/index.md: TRL @@ -149,8 +147,8 @@ 
plugins: "blog/data-centers-and-private-clouds.md": "blog/posts/gpu-blocks-and-proxy-jump.md" "blog/distributed-training-with-aws-efa.md": "examples/clusters/aws/index.md" "blog/dstack-stats.md": "blog/posts/dstack-metrics.md" - "docs/concepts/metrics.md": "docs/guides/metrics.md" - "docs/guides/monitoring.md": "docs/guides/metrics.md" + "docs/guides/metrics.md": "docs/concepts/metrics.md" + "docs/guides/monitoring.md": "docs/concepts/metrics.md" "blog/nvidia-and-amd-on-vultr.md.md": "blog/posts/nvidia-and-amd-on-vultr.md" "examples/misc/nccl-tests/index.md": "examples/clusters/nccl-rccl-tests/index.md" "examples/misc/a3high-clusters/index.md": "examples/clusters/gcp/index.md" @@ -268,19 +266,18 @@ nav: - Tasks: docs/concepts/tasks.md - Services: docs/concepts/services.md - Volumes: docs/concepts/volumes.md - - Secrets: docs/concepts/secrets.md - - Projects: docs/concepts/projects.md - - Gateways: docs/concepts/gateways.md + - More: + - Gateways: docs/concepts/gateways.md + - Secrets: docs/concepts/secrets.md + - Projects: docs/concepts/projects.md + - Metrics: docs/concepts/metrics.md - Guides: - - Clusters: docs/guides/clusters.md - - Kubernetes: docs/guides/kubernetes.md - Server deployment: docs/guides/server-deployment.md - Troubleshooting: docs/guides/troubleshooting.md - - Metrics: docs/guides/metrics.md - Protips: docs/guides/protips.md - Upgrade: docs/guides/upgrade.md - - Migration: - - Slurm: docs/guides/migration/slurm.md + - Migration: + - Slurm: docs/guides/migration/slurm.md - Reference: - .dstack.yml: - dev-environment: docs/reference/dstack.yml/dev-environment.md From b93476e187450ecbf0ac729a5c195d278580e1b3 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:36:58 +0000 Subject: [PATCH 086/187] Support secret events in API, CLI, and UI (#3504) --- .../Events/List/hooks/useColumnDefinitions.tsx | 13 +++++++++++++ .../src/pages/Events/List/hooks/useFilters.ts | 9 +++++++++ 
frontend/src/types/event.d.ts | 5 +++-- src/dstack/_internal/cli/commands/event.py | 11 +++++++++++ src/dstack/_internal/cli/services/events.py | 1 + src/dstack/_internal/core/models/events.py | 1 + src/dstack/_internal/server/routers/events.py | 1 + src/dstack/_internal/server/schemas/events.py | 11 +++++++++++ src/dstack/_internal/server/services/events.py | 17 +++++++++++++++++ src/dstack/api/server/_events.py | 2 ++ 10 files changed, 69 insertions(+), 2 deletions(-) diff --git a/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx b/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx index d6e5b846ea..bc9a07aa0c 100644 --- a/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx +++ b/frontend/src/pages/Events/List/hooks/useColumnDefinitions.tsx @@ -138,6 +138,19 @@ export const useColumnsDefinitions = () => { ); + case 'secret': + return ( +
+ Secret{' '} + {target.project_name && ( + + {target.project_name} + + )} + /{target.name} +
+ ); + default: return '---'; } diff --git a/frontend/src/pages/Events/List/hooks/useFilters.ts b/frontend/src/pages/Events/List/hooks/useFilters.ts index d463770b30..e6a97c71e8 100644 --- a/frontend/src/pages/Events/List/hooks/useFilters.ts +++ b/frontend/src/pages/Events/List/hooks/useFilters.ts @@ -19,6 +19,7 @@ type RequestParamsKeys = keyof Pick< | 'target_jobs' | 'target_volumes' | 'target_gateways' + | 'target_secrets' | 'within_projects' | 'within_fleets' | 'within_runs' @@ -35,6 +36,7 @@ const filterKeys: Record = { TARGET_JOBS: 'target_jobs', TARGET_VOLUMES: 'target_volumes', TARGET_GATEWAYS: 'target_gateways', + TARGET_SECRETS: 'target_secrets', WITHIN_PROJECTS: 'within_projects', WITHIN_FLEETS: 'within_fleets', WITHIN_RUNS: 'within_runs', @@ -53,6 +55,7 @@ const multipleChoiseKeys: RequestParamsKeys[] = [ 'target_jobs', 'target_volumes', 'target_gateways', + 'target_secrets', 'within_projects', 'within_fleets', 'within_runs', @@ -69,6 +72,7 @@ const targetTypes = [ { label: 'Job', value: 'job' }, { label: 'Volume', value: 'volume' }, { label: 'Gateway', value: 'gateway' }, + { label: 'Secret', value: 'secret' }, ]; export const useFilters = () => { @@ -171,6 +175,11 @@ export const useFilters = () => { operators: ['='], propertyLabel: 'Target gateways', }, + { + key: filterKeys.TARGET_SECRETS, + operators: ['='], + propertyLabel: 'Target secrets', + }, { key: filterKeys.WITHIN_PROJECTS, diff --git a/frontend/src/types/event.d.ts b/frontend/src/types/event.d.ts index 618ea6673f..dd0147fe15 100644 --- a/frontend/src/types/event.d.ts +++ b/frontend/src/types/event.d.ts @@ -1,4 +1,4 @@ -declare type TEventTargetType = 'project' | 'user' | 'fleet' | 'instance' | 'run' | 'job' | 'volume' | 'gateway'; +declare type TEventTargetType = 'project' | 'user' | 'fleet' | 'instance' | 'run' | 'job' | 'volume' | 'gateway' | 'secret'; declare type TEventListRequestParams = Omit & { prev_recorded_at?: string; @@ -10,6 +10,7 @@ declare type TEventListRequestParams = Omit 
EventListFilters: " Update the server to 0.20.7 or higher or remove --target-gateway." ) filters.target_gateways.append(id) + elif args.target_secrets: + filters.target_secrets = [ + api.client.secrets.get(api.project, name=name).id for name in args.target_secrets + ] if args.within_fleets: filters.within_fleets = [ diff --git a/src/dstack/_internal/cli/services/events.py b/src/dstack/_internal/cli/services/events.py index 11f764bd15..0f0eb0f4b4 100644 --- a/src/dstack/_internal/cli/services/events.py +++ b/src/dstack/_internal/cli/services/events.py @@ -18,6 +18,7 @@ class EventListFilters: target_runs: Optional[list[uuid.UUID]] = None target_volumes: Optional[list[uuid.UUID]] = None target_gateways: Optional[list[uuid.UUID]] = None + target_secrets: Optional[list[uuid.UUID]] = None within_projects: Optional[list[uuid.UUID]] = None within_fleets: Optional[list[uuid.UUID]] = None within_runs: Optional[list[uuid.UUID]] = None diff --git a/src/dstack/_internal/core/models/events.py b/src/dstack/_internal/core/models/events.py index 289c4fc674..f2efb80d0e 100644 --- a/src/dstack/_internal/core/models/events.py +++ b/src/dstack/_internal/core/models/events.py @@ -18,6 +18,7 @@ class EventTargetType(str, Enum): JOB = "job" VOLUME = "volume" GATEWAY = "gateway" + SECRET = "secret" class EventTarget(CoreModel): diff --git a/src/dstack/_internal/server/routers/events.py b/src/dstack/_internal/server/routers/events.py index 4250eb4d7a..036a8b2be8 100644 --- a/src/dstack/_internal/server/routers/events.py +++ b/src/dstack/_internal/server/routers/events.py @@ -46,6 +46,7 @@ async def list_events( target_jobs=body.target_jobs, target_volumes=body.target_volumes, target_gateways=body.target_gateways, + target_secrets=body.target_secrets, within_projects=body.within_projects, within_fleets=body.within_fleets, within_runs=body.within_runs, diff --git a/src/dstack/_internal/server/schemas/events.py b/src/dstack/_internal/server/schemas/events.py index 30f7fe3244..3899b1f398 
100644 --- a/src/dstack/_internal/server/schemas/events.py +++ b/src/dstack/_internal/server/schemas/events.py @@ -102,6 +102,17 @@ class ListEventsRequest(CoreModel): max_items=MAX_FILTER_ITEMS, ), ] = None + target_secrets: Annotated[ + Optional[list[uuid.UUID]], + Field( + description=( + "List of secret IDs." + " The response will only include events that target the specified secrets" + ), + min_items=MIN_FILTER_ITEMS, + max_items=MAX_FILTER_ITEMS, + ), + ] = None within_projects: Annotated[ Optional[list[uuid.UUID]], Field( diff --git a/src/dstack/_internal/server/services/events.py b/src/dstack/_internal/server/services/events.py index c6d35a4577..d46b43e201 100644 --- a/src/dstack/_internal/server/services/events.py +++ b/src/dstack/_internal/server/services/events.py @@ -20,6 +20,7 @@ MemberModel, ProjectModel, RunModel, + SecretModel, UserModel, VolumeModel, ) @@ -93,6 +94,7 @@ def from_model( JobModel, ProjectModel, RunModel, + SecretModel, UserModel, VolumeModel, ], @@ -139,6 +141,13 @@ def from_model( id=model.id, name=model.run_name, ) + if isinstance(model, SecretModel): + return Target( + type=EventTargetType.SECRET, + project_id=model.project_id or model.project.id, + id=model.id, + name=model.name, + ) if isinstance(model, UserModel): return Target( type=EventTargetType.USER, @@ -232,6 +241,7 @@ async def list_events( target_jobs: Optional[list[uuid.UUID]], target_volumes: Optional[list[uuid.UUID]], target_gateways: Optional[list[uuid.UUID]], + target_secrets: Optional[list[uuid.UUID]], within_projects: Optional[list[uuid.UUID]], within_fleets: Optional[list[uuid.UUID]], within_runs: Optional[list[uuid.UUID]], @@ -315,6 +325,13 @@ async def list_events( EventTargetModel.entity_id.in_(target_gateways), ) ) + if target_secrets is not None: + target_filters.append( + and_( + EventTargetModel.entity_type == EventTargetType.SECRET, + EventTargetModel.entity_id.in_(target_secrets), + ) + ) if within_projects is not None: 
target_filters.append(EventTargetModel.entity_project_id.in_(within_projects)) if within_fleets is not None: diff --git a/src/dstack/api/server/_events.py b/src/dstack/api/server/_events.py index d403fb2427..5aff9c9d21 100644 --- a/src/dstack/api/server/_events.py +++ b/src/dstack/api/server/_events.py @@ -31,6 +31,7 @@ def list( # NOTE: New parameters go here. Avoid positional parameters, they can break compatibility. target_volumes: Optional[list[UUID]] = None, target_gateways: Optional[list[UUID]] = None, + target_secrets: Optional[list[UUID]] = None, ) -> list[Event]: if prev_recorded_at is not None: # Time zones other than UTC are misinterpreted by the server: @@ -45,6 +46,7 @@ def list( target_jobs=target_jobs, target_volumes=target_volumes, target_gateways=target_gateways, + target_secrets=target_secrets, within_projects=within_projects, within_fleets=within_fleets, within_runs=within_runs, From ca56b3b9f9d43879d866242548e4f5e9706e6581 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 27 Jan 2026 21:21:00 +0100 Subject: [PATCH 087/187] [Docs] Events #3397 (#3506) * [Docs] Events #3397 * Update docs/docs/concepts/events.md Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> * Update docs/docs/concepts/events.md Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> * Update docs/docs/concepts/events.md Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> * Update docs/docs/concepts/events.md Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> * Update docs/docs/concepts/events.md Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> * Update docs/docs/concepts/events.md Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> --------- Co-authored-by: jvstme <36324149+jvstme@users.noreply.github.com> --- docs/docs/concepts/events.md | 70 +++++++++++++++++++++++++ docs/docs/reference/cli/dstack/event.md | 5 +- mkdocs.yml | 4 +- 3 
files changed, 75 insertions(+), 4 deletions(-) create mode 100644 docs/docs/concepts/events.md diff --git a/docs/docs/concepts/events.md b/docs/docs/concepts/events.md new file mode 100644 index 0000000000..fb6e0938a2 --- /dev/null +++ b/docs/docs/concepts/events.md @@ -0,0 +1,70 @@ +# Events + +Events provide a chronological record of notable state changes and operations affecting `dstack` resources. They are designed for auditing, debugging, and understanding the lifecycle of runs, jobs, fleets, and other resources. + +Each event includes the following fields: + +| Field | Description | +| --------- | ----------------------------------------------------------- | +| Timestamp | When the event occurred | +| Actor | The user or system that initiated the change, if applicable | +| Targets | The resources affected by the event | +| Message | A description of the change or additional event details | + +Events can be queried by targeting a specific resource or within a group of related resources. For example, you can query events targeting a particular job, or query events within a run, including the run itself and all of its jobs. + +Events are accessible through the UI, CLI, and API. + +## UI + +The UI allows you to query events either globally on the dedicated `Events` page or within a specific group on the page of a run, job, fleet, and other resources. + +### Global page + +The global page shows events from all projects that the user has access to and allows filtering by many fields. + +![](https://dstack.ai/static-assets/static-assets/images/dstack-ui-events-global.png){ width=800 } + +This page allows you to query events targeting a specific resource or within a particular group. + +### Resource page + +The resource page shows events within that specific group. For example, if you open a run and switch to the `Events` tab, you will see all events about that run and its jobs. 
+ +![](https://dstack.ai/static-assets/static-assets/images/dstack-ui-events-run.png){ width=800 } + +## CLI + +To query events via the CLI, use the `dstack event` command. This command provides several arguments that allow filtering by target and within scopes. + +Here is an example of querying all events within a particular run: + +
+ +```shell +$ dstack event --within-run cursor + +[2026-01-21 13:09:37] [👤admin] [run cursor] Run submitted. Status: SUBMITTED +[2026-01-21 13:09:37] [job cursor-0-0] Job created on run submission. Status: SUBMITTED +[2026-01-21 13:09:57] [job cursor-0-0] Job status changed SUBMITTED -> PROVISIONING +[2026-01-21 13:09:58] [job cursor-0-0, instance some-fleet-0] Instance created for job. Instance status: PROVISIONING +[2026-01-21 13:09:59] [run cursor] Run status changed SUBMITTED -> PROVISIONING +[2026-01-21 13:11:22] [job cursor-0-0] Job status changed PROVISIONING -> PULLING +[2026-01-21 13:11:49] [job cursor-0-0] Job status changed PULLING -> RUNNING +[2026-01-21 13:11:51] [run cursor] Run status changed PROVISIONING -> RUNNING +[2026-01-21 13:18:41] [👤admin] [run cursor] Run status changed RUNNING -> TERMINATING. Termination reason: STOPPED_BY_USER +[2026-01-21 13:18:48] [job cursor-0-0] Job status changed RUNNING -> TERMINATING. Termination reason: TERMINATED_BY_USER +[2026-01-21 13:19:05] [instance some-fleet-0, job cursor-0-0] Job unassigned from instance. Instance blocks: 0/1 busy +[2026-01-21 13:19:05] [job cursor-0-0] Job status changed TERMINATING -> TERMINATED +[2026-01-21 13:19:07] [run cursor] Run status changed TERMINATING -> TERMINATED +``` + +
+ +To see all supported arguments, check the [reference](../reference/cli/dstack/event.md). + +If you invoke the command without arguments, it will include all events targeting resources in the project. + +## TTL + +By default, `dstack` stores each event for 30 days and then deletes it. This can be overridden by server administrators using the `DSTACK_SERVER_EVENTS_TTL_SECONDS` environment variable. diff --git a/docs/docs/reference/cli/dstack/event.md b/docs/docs/reference/cli/dstack/event.md index 9854609208..8f90e456c9 100644 --- a/docs/docs/reference/cli/dstack/event.md +++ b/docs/docs/reference/cli/dstack/event.md @@ -1,9 +1,8 @@ # dstack event -The `dstack event` command (an alias for `dstack event list`) allows you to view events. - - +The `dstack event` command, an alias for `dstack event list`, allows you to view events. +For more details, see [Events](../../../concepts/events.md). ## Usage diff --git a/mkdocs.yml b/mkdocs.yml index 5dcdd69999..c7afe30256 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,7 +77,8 @@ plugins: - docs/concepts/gateways.md: How to manage gateways - enabling auto-scaling, rate limits, and custom domains - docs/concepts/secrets.md: How to manage secrets - for API keys or other sensitive data - docs/concepts/projects.md: How to manage projects - for managing separate teams - - docs/concepts/metrics.md: How to manage gateways - enabling auto-scaling, rate limits, and custom domains + - docs/concepts/metrics.md: How to access metrics - for monitoring of runs/jobs/fleets, hardware usage (e.g. 
GPU util), etc + - docs/concepts/events.md: How to access events - aka audit log - for tracking state changes, resource creation/deletion, etc Guides: - docs/guides/server-deployment.md: Detailed guide on how to deploy the dstack server - docs/guides/troubleshooting.md: Common issues and how to troubleshoot them @@ -271,6 +272,7 @@ nav: - Secrets: docs/concepts/secrets.md - Projects: docs/concepts/projects.md - Metrics: docs/concepts/metrics.md + - Events: docs/concepts/events.md - Guides: - Server deployment: docs/guides/server-deployment.md - Troubleshooting: docs/guides/troubleshooting.md From a1f0d58d1adae1da4a7cc6048251ef13c034a765 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 27 Jan 2026 21:21:32 +0100 Subject: [PATCH 088/187] [UX] Extend `dstack login` with interactive selection of `url` and default project (#3492) * [UX] Extend `dstack login` with interactive selection of `url` and default project * Updated tests * Addressing issues from the PR review (removed overly-agressibe catching of import and other exceptions when detecting if interactive menu is available). Plus, added handling KeyboardException in `dstack login`. 
--- src/dstack/_internal/cli/commands/login.py | 165 ++++++- src/dstack/_internal/cli/commands/project.py | 14 +- src/dstack/_internal/cli/utils/common.py | 6 +- .../_internal/cli/commands/test_login.py | 414 ++++++++++++++++-- 4 files changed, 520 insertions(+), 79 deletions(-) diff --git a/src/dstack/_internal/cli/commands/login.py b/src/dstack/_internal/cli/commands/login.py index 0919494e63..8431a76c61 100644 --- a/src/dstack/_internal/cli/commands/login.py +++ b/src/dstack/_internal/cli/commands/login.py @@ -1,18 +1,34 @@ import argparse import queue +import sys import threading import urllib.parse import webbrowser from http.server import BaseHTTPRequestHandler, HTTPServer -from typing import Optional +from typing import Any, Optional + +import questionary +from rich.prompt import Prompt as RichPrompt +from rich.text import Text from dstack._internal.cli.commands import BaseCommand +from dstack._internal.cli.commands.project import select_default_project from dstack._internal.cli.utils.common import console, resolve_url from dstack._internal.core.errors import ClientError, CLIError from dstack._internal.core.models.users import UserWithCreds +from dstack._internal.utils.logging import get_logger from dstack.api._public.runs import ConfigManager from dstack.api.server import APIClient +logger = get_logger(__name__) + +is_project_menu_supported = sys.stdin.isatty() + + +class UrlPrompt(RichPrompt): + def render_default(self, default: Any) -> Text: + return Text(f"({default})", style="bold orange1") + class LoginCommand(BaseCommand): NAME = "login" @@ -23,7 +39,7 @@ def _register(self): self._parser.add_argument( "--url", help="The server URL, e.g. https://sky.dstack.ai", - required=True, + required=not is_project_menu_supported, ) self._parser.add_argument( "-p", @@ -33,10 +49,25 @@ def _register(self): " Selected automatically if the server supports only one provider." 
), ) + self._parser.add_argument( + "-y", + "--yes", + help="Don't ask for confirmation (e.g. set first project as default)", + action="store_true", + ) + self._parser.add_argument( + "-n", + "--no", + help="Don't ask for confirmation (e.g. do not change default project)", + action="store_true", + ) def _command(self, args: argparse.Namespace): super()._command(args) - base_url = _normalize_url_or_error(args.url) + url = args.url + if url is None: + url = self._prompt_url() + base_url = _normalize_url_or_error(url) api_client = APIClient(base_url=base_url) provider = self._select_provider_or_error(api_client=api_client, provider=args.provider) server = _LoginServer(api_client=api_client, provider=provider) @@ -56,9 +87,9 @@ def _command(self, args: argparse.Namespace): server.shutdown() if user is None: raise CLIError("CLI authentication failed") - console.print(f"Logged in as [code]{user.username}[/].") + console.print(f"Logged in as [code]{user.username}[/]") api_client = APIClient(base_url=base_url, token=user.creds.token) - self._configure_projects(api_client=api_client, user=user) + self._configure_projects(api_client=api_client, user=user, args=args) def _select_provider_or_error(self, api_client: APIClient, provider: Optional[str]) -> str: providers = api_client.auth.list_providers() @@ -67,6 +98,8 @@ def _select_provider_or_error(self, api_client: APIClient, provider: Optional[st raise CLIError("No SSO providers configured on the server.") if provider is None: if len(available_providers) > 1: + if is_project_menu_supported: + return self._prompt_provider(available_providers) raise CLIError( "Specify -p/--provider to choose SSO provider" f" Available providers: {', '.join(available_providers)}" @@ -79,7 +112,37 @@ def _select_provider_or_error(self, api_client: APIClient, provider: Optional[st ) return provider - def _configure_projects(self, api_client: APIClient, user: UserWithCreds): + def _prompt_url(self) -> str: + try: + url = UrlPrompt.ask( + "Enter 
the server URL", + default="https://sky.dstack.ai", + console=console, + ) + except KeyboardInterrupt: + console.print("\nCancelled by user") + raise SystemExit(1) + if url is None: + raise CLIError("URL is required") + return url + + def _prompt_provider(self, available_providers: list[str]) -> str: + choices = [ + questionary.Choice(title=provider, value=provider) for provider in available_providers + ] + selected_provider = questionary.select( + message="Select SSO provider:", + choices=choices, + qmark="", + instruction="(↑↓ Enter)", + ).ask() + if selected_provider is None: + raise SystemExit(1) + return selected_provider + + def _configure_projects( + self, api_client: APIClient, user: UserWithCreds, args: argparse.Namespace + ): projects = api_client.projects.list(include_not_joined=False) if len(projects) == 0: console.print( @@ -89,30 +152,88 @@ def _configure_projects(self, api_client: APIClient, user: UserWithCreds): return config_manager = ConfigManager() default_project = config_manager.get_project_config() - new_default_project = None - for i, project in enumerate(projects): - set_as_default = ( - default_project is None - and i == 0 - or default_project is not None - and default_project.name == project.project_name - ) - if set_as_default: - new_default_project = project + for project in projects: config_manager.configure_project( name=project.project_name, url=api_client.base_url, token=user.creds.token, - default=set_as_default, + default=False, ) config_manager.save() + project_names = ", ".join(f"[code]{p.project_name}[/]" for p in projects) console.print( - f"Configured projects: {', '.join(f'[code]{p.project_name}[/]' for p in projects)}." + f"Added {project_names} project{'' if len(projects) == 1 else 's'} at {config_manager.config_filepath}" ) - if new_default_project: - console.print( - f"Set project [code]{new_default_project.project_name}[/] as default project." 
- ) + + project_configs = config_manager.list_project_configs() + + if args.no: + return + + if args.yes: + if len(projects) > 0: + first_project_from_server = projects[0] + first_project_config = next( + ( + pc + for pc in project_configs + if pc.name == first_project_from_server.project_name + ), + None, + ) + if first_project_config is not None: + config_manager.configure_project( + name=first_project_config.name, + url=first_project_config.url, + token=first_project_config.token, + default=True, + ) + config_manager.save() + console.print( + f"Set [code]{first_project_config.name}[/] project as default at {config_manager.config_filepath}" + ) + return + + if len(project_configs) == 1 or not is_project_menu_supported: + selected_project = None + if len(project_configs) == 1: + selected_project = project_configs[0] + else: + for i, project in enumerate(projects): + set_as_default = ( + default_project is None + and i == 0 + or default_project is not None + and default_project.name == project.project_name + ) + if set_as_default: + selected_project = next( + (pc for pc in project_configs if pc.name == project.project_name), + None, + ) + break + if selected_project is not None: + config_manager.configure_project( + name=selected_project.name, + url=selected_project.url, + token=selected_project.token, + default=True, + ) + config_manager.save() + console.print( + f"Set [code]{selected_project.name}[/] project as default at {config_manager.config_filepath}" + ) + else: + console.print() + selected_project = select_default_project(project_configs, default_project) + if selected_project is not None: + config_manager.configure_project( + name=selected_project.name, + url=selected_project.url, + token=selected_project.token, + default=True, + ) + config_manager.save() class _BadRequestError(Exception): diff --git a/src/dstack/_internal/cli/commands/project.py b/src/dstack/_internal/cli/commands/project.py index db4a7a5eb9..5edd4e64ec 100644 --- 
a/src/dstack/_internal/cli/commands/project.py +++ b/src/dstack/_internal/cli/commands/project.py @@ -2,16 +2,10 @@ import sys from typing import Any, Optional, Union +import questionary from requests import HTTPError from rich.table import Table -try: - import questionary - - is_project_menu_supported = sys.stdin.isatty() -except (ImportError, NotImplementedError, AttributeError): - is_project_menu_supported = False - import dstack.api.server from dstack._internal.cli.commands import BaseCommand from dstack._internal.cli.utils.common import add_row_from_dict, confirm_ask, console @@ -22,6 +16,8 @@ logger = get_logger(__name__) +is_project_menu_supported = sys.stdin.isatty() + def select_default_project( project_configs: list[ProjectConfig], default_project: Optional[ProjectConfig] @@ -57,9 +53,9 @@ def select_default_project( default_index = i menu_entries.append((entry, i)) - choices = [questionary.Choice(title=entry, value=index) for entry, index in menu_entries] # pyright: ignore[reportPossiblyUnboundVariable] + choices = [questionary.Choice(title=entry, value=index) for entry, index in menu_entries] default_value = default_index - selected_index = questionary.select( # pyright: ignore[reportPossiblyUnboundVariable] + selected_index = questionary.select( message="Select the default project:", choices=choices, default=default_value, # pyright: ignore[reportArgumentType] diff --git a/src/dstack/_internal/cli/utils/common.py b/src/dstack/_internal/cli/utils/common.py index d53b84567b..afbfb2d8cb 100644 --- a/src/dstack/_internal/cli/utils/common.py +++ b/src/dstack/_internal/cli/utils/common.py @@ -99,7 +99,11 @@ def configure_logging(): def confirm_ask(prompt, **kwargs) -> bool: kwargs["console"] = console - return Confirm.ask(prompt=prompt, **kwargs) + try: + return Confirm.ask(prompt=prompt, **kwargs) + except KeyboardInterrupt: + console.print("\nCancelled by user") + raise SystemExit(1) def add_row_from_dict(table: Table, data: Dict[Union[str, int], Any], 
**kwargs): diff --git a/src/tests/_internal/cli/commands/test_login.py b/src/tests/_internal/cli/commands/test_login.py index bd7eba584f..4e0c27ae34 100644 --- a/src/tests/_internal/cli/commands/test_login.py +++ b/src/tests/_internal/cli/commands/test_login.py @@ -8,34 +8,72 @@ class TestLogin: + @staticmethod + def _setup_auth_mocks(api_client_mock, login_server_mock, user_token="token"): + """Set up common authentication mocks.""" + api_client_mock.return_value.auth.list_providers.return_value = [ + SimpleNamespace(name="github", enabled=True) + ] + api_client_mock.return_value.auth.authorize.return_value = SimpleNamespace( + authorization_url="http://auth_url" + ) + user = SimpleNamespace(username="me", creds=SimpleNamespace(token=user_token)) + login_server_mock.return_value.get_logged_in_user.return_value = user + return user + + @staticmethod + def _setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path: Path, project_configs: list[SimpleNamespace] + ): + """Set up ConfigManager mock with state tracking via side effects.""" + config_manager_mock.return_value.config_filepath = tmp_path / "config.yml" + config_manager_mock.return_value.list_project_configs.return_value = project_configs + + def configure_project_side_effect(name, url, token, default): + for pc in project_configs: + if pc.name == name: + pc.url = url + pc.token = token + if default: + for p in project_configs: + p.default = False + pc.default = default or pc.default + return + + def get_project_config_side_effect(name=None): + if name is None: + for pc in project_configs: + if pc.default: + return pc + return None + for pc in project_configs: + if pc.name == name: + return pc + return None + + config_manager_mock.return_value.configure_project.side_effect = ( + configure_project_side_effect + ) + config_manager_mock.return_value.get_project_config.side_effect = ( + get_project_config_side_effect + ) + def test_login_no_projects(self, capsys: CaptureFixture, tmp_path: Path): 
with ( patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, - patch("dstack._internal.cli.commands.login.APIClient") as APIClientMock, - patch("dstack._internal.cli.commands.login._LoginServer") as LoginServerMock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, patch( "dstack._internal.cli.commands.login._normalize_url_or_error" - ) as _normalize_url_or_error_mock, + ) as normalize_url_mock, ): webbrowser_mock.open.return_value = True - _normalize_url_or_error_mock.return_value = "http://127.0.0.1:31313" - APIClientMock.return_value.auth.list_providers.return_value = [ - SimpleNamespace(name="github", enabled=True) - ] - APIClientMock.return_value.auth.authorize.return_value = SimpleNamespace( - authorization_url="http://auth_url" - ) - APIClientMock.return_value.projects.list.return_value = [] - user = SimpleNamespace(username="me", creds=SimpleNamespace(token="token")) - LoginServerMock.return_value.get_logged_in_user.return_value = user + normalize_url_mock.return_value = "http://127.0.0.1:31313" + self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [] + exit_code = run_dstack_cli( - [ - "login", - "--url", - "http://127.0.0.1:31313", - "--provider", - "github", - ], + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github"], home_dir=tmp_path, ) @@ -43,53 +81,331 @@ def test_login_no_projects(self, capsys: CaptureFixture, tmp_path: Path): assert capsys.readouterr().out.replace("\n", "") == ( "Your browser has been opened to log in with Github:" "http://auth_url" - "Logged in as me." + "Logged in as me" "No projects configured. Create your own project via the UI or contact a project manager to add you to the project." 
) def test_login_configures_projects(self, capsys: CaptureFixture, tmp_path: Path): with ( patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, - patch("dstack._internal.cli.commands.login.APIClient") as APIClientMock, - patch("dstack._internal.cli.commands.login.ConfigManager") as ConfigManagerMock, - patch("dstack._internal.cli.commands.login._LoginServer") as LoginServerMock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, patch( "dstack._internal.cli.commands.login._normalize_url_or_error" - ) as _normalize_url_or_error_mock, + ) as normalize_url_mock, ): - _normalize_url_or_error_mock.return_value = "http://127.0.0.1:31313" webbrowser_mock.open.return_value = True - APIClientMock.return_value.auth.list_providers.return_value = [ - SimpleNamespace(name="github", enabled=True) + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + SimpleNamespace(project_name="project2"), ] - APIClientMock.return_value.auth.authorize.return_value = SimpleNamespace( - authorization_url="http://auth_url" + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + SimpleNamespace( + name="project2", url="http://127.0.0.1:31313", token="token", default=False + ), + ] + config_manager_mock.return_value.get_project_config.return_value = None + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github"], + 
home_dir=tmp_path, + ) + + config_manager_mock.return_value.configure_project.assert_has_calls( + [ + call( + name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project2", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project1", url="http://127.0.0.1:31313", token="token", default=True + ), + ] ) - APIClientMock.return_value.projects.list.return_value = [ + config_manager_mock.return_value.save.assert_called() + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project1" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1, project2 projects at {tmp_path / 'config.yml'}" + f"Set project1 project as default at {tmp_path / 'config.yml'}" + ) + + def test_login_configures_projects_yes_sets_first_project_default( + self, capsys: CaptureFixture, tmp_path: Path + ): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + SimpleNamespace(project_name="project2"), + ] + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + 
name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + SimpleNamespace( + name="project2", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github", "--yes"], + home_dir=tmp_path, + ) + + config_manager_mock.return_value.configure_project.assert_has_calls( + [ + call( + name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project2", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project1", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + ) + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project1" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1, project2 projects at {tmp_path / 'config.yml'}" + f"Set project1 project as default at {tmp_path / 'config.yml'}" + ) + + def test_login_configures_projects_no_does_not_change_default( + self, capsys: CaptureFixture, tmp_path: Path + ): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, 
login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ SimpleNamespace(project_name="project1"), SimpleNamespace(project_name="project2"), ] - APIClientMock.return_value.base_url = "http://127.0.0.1:31313" - ConfigManagerMock.return_value.get_project_config.return_value = None - user = SimpleNamespace(username="me", creds=SimpleNamespace(token="token")) - LoginServerMock.return_value.get_logged_in_user.return_value = user + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + SimpleNamespace( + name="project2", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github", "--no"], + home_dir=tmp_path, + ) + + config_manager_mock.return_value.configure_project.assert_has_calls( [ - "login", - "--url", - "http://127.0.0.1:31313", - "--provider", - "github", - ], + call( + name="project1", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + call( + name="project2", + url="http://127.0.0.1:31313", + token=user.creds.token, + default=False, + ), + ] + ) + assert ( + call(name="project1", url="http://127.0.0.1:31313", token="token", default=True) + not in config_manager_mock.return_value.configure_project.mock_calls + ) + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project2" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1, project2 projects at {tmp_path / 'config.yml'}" + ) + + def test_login_single_project_auto_default(self, capsys: 
CaptureFixture, tmp_path: Path): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + ] + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + ] + config_manager_mock.return_value.get_project_config.return_value = None + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github"], home_dir=tmp_path, ) - ConfigManagerMock.return_value.configure_project.assert_has_calls( + + config_manager_mock.return_value.configure_project.assert_has_calls( [ call( name="project1", url="http://127.0.0.1:31313", token=user.creds.token, - default=True, + default=False, + ), + call( + name="project1", url="http://127.0.0.1:31313", token="token", default=True + ), + ] + ) + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project1" + + assert exit_code == 0 + assert capsys.readouterr().out.replace("\n", "") == ( + "Your browser has been opened to log in with Github:" + "http://auth_url" + "Logged in as me" + f"Added project1 project at {tmp_path / 'config.yml'}" + f"Set 
project1 project as default at {tmp_path / 'config.yml'}" + ) + + def test_login_interactive_prompts_for_default_project( + self, capsys: CaptureFixture, tmp_path: Path + ): + with ( + patch("dstack._internal.cli.commands.login.webbrowser") as webbrowser_mock, + patch("dstack._internal.cli.commands.login.APIClient") as api_client_mock, + patch("dstack._internal.cli.commands.login.ConfigManager") as config_manager_mock, + patch("dstack._internal.cli.commands.login._LoginServer") as login_server_mock, + patch( + "dstack._internal.cli.commands.login._normalize_url_or_error" + ) as normalize_url_mock, + patch( + "dstack._internal.cli.commands.login.select_default_project" + ) as select_default_project_mock, + patch("dstack._internal.cli.commands.login.is_project_menu_supported", True), + ): + webbrowser_mock.open.return_value = True + normalize_url_mock.return_value = "http://127.0.0.1:31313" + user = self._setup_auth_mocks(api_client_mock, login_server_mock) + api_client_mock.return_value.projects.list.return_value = [ + SimpleNamespace(project_name="project1"), + SimpleNamespace(project_name="project2"), + ] + api_client_mock.return_value.base_url = "http://127.0.0.1:31313" + + project_configs = [ + SimpleNamespace( + name="project1", url="http://127.0.0.1:31313", token="token", default=False + ), + SimpleNamespace( + name="project2", url="http://127.0.0.1:31313", token="token", default=False + ), + ] + config_manager_mock.return_value.get_project_config.return_value = None + self._setup_config_manager_with_state_tracking( + config_manager_mock, tmp_path, project_configs + ) + select_default_project_mock.return_value = project_configs[1] + + exit_code = run_dstack_cli( + ["login", "--url", "http://127.0.0.1:31313", "--provider", "github"], + home_dir=tmp_path, + ) + + select_default_project_mock.assert_called_once() + config_manager_mock.return_value.configure_project.assert_has_calls( + [ + call( + name="project1", + url="http://127.0.0.1:31313", + 
token=user.creds.token, + default=False, ), call( name="project2", @@ -97,15 +413,19 @@ def test_login_configures_projects(self, capsys: CaptureFixture, tmp_path: Path) token=user.creds.token, default=False, ), + call( + name="project2", url="http://127.0.0.1:31313", token="token", default=True + ), ] ) - ConfigManagerMock.return_value.save.assert_called() + final_default = config_manager_mock.return_value.get_project_config() + assert final_default is not None + assert final_default.name == "project2" assert exit_code == 0 assert capsys.readouterr().out.replace("\n", "") == ( "Your browser has been opened to log in with Github:" "http://auth_url" - "Logged in as me." - "Configured projects: project1, project2." - "Set project project1 as default project." + "Logged in as me" + f"Added project1, project2 projects at {tmp_path / 'config.yml'}" ) From bef08a593979e2b68ac3e6125061c5f85611dcfb Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Wed, 28 Jan 2026 09:30:58 +0000 Subject: [PATCH 089/187] Add secret lifecycle events (#3505) - Secret created - Secret updated - Secret deleted --- .../_internal/server/routers/secrets.py | 6 +- .../_internal/server/services/secrets.py | 127 ++++++++++++------ .../_internal/server/routers/test_secrets.py | 32 +++++ 3 files changed, 120 insertions(+), 45 deletions(-) diff --git a/src/dstack/_internal/server/routers/secrets.py b/src/dstack/_internal/server/routers/secrets.py index c19f15bccc..30cbdc60c7 100644 --- a/src/dstack/_internal/server/routers/secrets.py +++ b/src/dstack/_internal/server/routers/secrets.py @@ -59,13 +59,14 @@ async def create_or_update_secret( session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), ): - _, project = user_project + user, project = user_project return CustomORJSONResponse( await secrets_services.create_or_update_secret( session=session, project=project, name=body.name, value=body.value, + 
user=user, ) ) @@ -76,9 +77,10 @@ async def delete_secrets( session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), ): - _, project = user_project + user, project = user_project await secrets_services.delete_secrets( session=session, project=project, names=body.secrets_names, + user=user, ) diff --git a/src/dstack/_internal/server/services/secrets.py b/src/dstack/_internal/server/services/secrets.py index 1cea57f655..ed12576256 100644 --- a/src/dstack/_internal/server/services/secrets.py +++ b/src/dstack/_internal/server/services/secrets.py @@ -1,8 +1,11 @@ import re +import uuid +from collections.abc import AsyncGenerator +from contextlib import asynccontextmanager from typing import Dict, List, Optional import sqlalchemy.exc -from sqlalchemy import delete, select, update +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.errors import ( @@ -11,11 +14,10 @@ ServerClientError, ) from dstack._internal.core.models.secrets import Secret -from dstack._internal.server.models import DecryptedString, ProjectModel, SecretModel -from dstack._internal.utils.logging import get_logger - -logger = get_logger(__name__) - +from dstack._internal.server.db import get_db +from dstack._internal.server.models import DecryptedString, ProjectModel, SecretModel, UserModel +from dstack._internal.server.services import events +from dstack._internal.server.services.locking import get_locker _SECRET_NAME_REGEX = "^[A-Za-z0-9-_]{1,200}$" _SECRET_VALUE_MAX_LENGTH = 5000 @@ -57,6 +59,7 @@ async def create_or_update_secret( project: ProjectModel, name: str, value: str, + user: UserModel, ) -> Secret: _validate_secret(name=name, value=value) try: @@ -65,6 +68,7 @@ async def create_or_update_secret( project=project, name=name, value=value, + user=user, ) except ResourceExistsError: secret_model = await update_secret( @@ -72,6 +76,7 @@ async def create_or_update_secret( 
project=project, name=name, value=value, + user=user, ) return secret_model_to_secret(secret_model, include_value=True) @@ -80,26 +85,24 @@ async def delete_secrets( session: AsyncSession, project: ProjectModel, names: List[str], + user: UserModel, ): - existing_secrets_query = await session.execute( - select(SecretModel).where( - SecretModel.project_id == project.id, - SecretModel.name.in_(names), - ) - ) - existing_names = [s.name for s in existing_secrets_query.scalars().all()] - missing_names = set(names) - set(existing_names) - if missing_names: - raise ResourceNotExistsError(f"Secrets not found: {', '.join(missing_names)}") - - await session.execute( - delete(SecretModel).where( - SecretModel.project_id == project.id, - SecretModel.name.in_(names), - ) - ) - await session.commit() - logger.info("Deleted secrets %s in project %s", names, project.name) + async with get_project_secret_models_by_name_for_update( + session=session, project=project, names=names + ) as secret_models: + existing_names = [s.name for s in secret_models] + missing_names = set(names) - set(existing_names) + if missing_names: + raise ResourceNotExistsError(f"Secrets not found: {', '.join(missing_names)}") + for secret_model in secret_models: + await session.delete(secret_model) + events.emit( + session, + "Secret deleted", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(secret_model)], + ) + await session.commit() def secret_model_to_secret(secret_model: SecretModel, include_value: bool = False) -> Secret: @@ -142,13 +145,47 @@ async def get_project_secret_model_by_name( return res.scalar_one_or_none() +@asynccontextmanager +async def get_project_secret_models_by_name_for_update( + session: AsyncSession, project: ProjectModel, names: list[str] +) -> AsyncGenerator[list[SecretModel], None]: + """ + Fetch secrets from the database and lock them for update. 
+ + **NOTE**: commit changes to the database before exiting from this context manager, + so that in-memory locks are only released after commit. + """ + filters = [ + SecretModel.project_id == project.id, + SecretModel.name.in_(names), + ] + res = await session.execute(select(SecretModel.id).where(*filters)) + secret_ids = res.scalars().all() + if not secret_ids: + yield [] + else: + async with get_locker(get_db().dialect_name).lock_ctx( + SecretModel.__tablename__, sorted(secret_ids) + ): + # Refetch after lock + res = await session.execute( + select(SecretModel) + .where(SecretModel.id.in_(secret_ids), *filters) + .with_for_update(key_share=True) + .order_by(SecretModel.id) # take locks in order + ) + yield list(res.scalars().all()) + + async def create_secret( session: AsyncSession, project: ProjectModel, name: str, value: str, + user: UserModel, ) -> SecretModel: secret_model = SecretModel( + id=uuid.uuid4(), project_id=project.id, name=name, value=DecryptedString(plaintext=value), @@ -156,6 +193,12 @@ async def create_secret( try: async with session.begin_nested(): session.add(secret_model) + events.emit( + session, + "Secret created", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(secret_model)], + ) except sqlalchemy.exc.IntegrityError: raise ResourceExistsError() await session.commit() @@ -167,25 +210,23 @@ async def update_secret( project: ProjectModel, name: str, value: str, + user: UserModel, ) -> SecretModel: - await session.execute( - update(SecretModel) - .where( - SecretModel.project_id == project.id, - SecretModel.name == name, - ) - .values( - value=DecryptedString(plaintext=value), - ) - ) - await session.commit() - secret_model = await get_project_secret_model_by_name( - session=session, - project=project, - name=name, - ) - if secret_model is None: - raise ResourceNotExistsError() + async with get_project_secret_models_by_name_for_update( + session=session, project=project, names=[name] + ) as secret_models: + if 
not secret_models: + raise ResourceNotExistsError() + secret_model = secret_models[0] + if secret_model.value.get_plaintext_or_error() != value: + secret_model.value = DecryptedString(plaintext=value) + events.emit( + session, + "Secret updated", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(secret_model)], + ) + await session.commit() return secret_model diff --git a/src/tests/_internal/server/routers/test_secrets.py b/src/tests/_internal/server/routers/test_secrets.py index a2279c770c..302591b881 100644 --- a/src/tests/_internal/server/routers/test_secrets.py +++ b/src/tests/_internal/server/routers/test_secrets.py @@ -11,6 +11,7 @@ create_secret, create_user, get_auth_headers, + list_events, ) @@ -145,6 +146,9 @@ async def test_creates_secret(self, test_db, session: AsyncSession, client: Asyn res = await session.execute(select(SecretModel)) secret_model = res.scalar() assert secret_model is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Secret created" @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -165,6 +169,29 @@ async def test_updates_secret(self, test_db, session: AsyncSession, client: Asyn assert response.status_code == 200 await session.refresh(secret) assert secret.value.get_plaintext_or_error() == "new_value" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Secret updated" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_no_event_if_value_unchanged( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + await create_secret(session=session, 
project=project, name="secret1", value="value") + response = await client.post( + f"/api/project/{project.name}/secrets/create_or_update", + headers=get_auth_headers(user.token), + json={"name": "secret1", "value": "value"}, + ) + assert response.status_code == 200 + events = await list_events(session) + assert len(events) == 0 @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -253,6 +280,11 @@ async def test_deletes_secrets(self, test_db, session: AsyncSession, client: Asy assert len(secrets) == 1 assert secrets[0].name == "secret2" + # Verify event was emitted + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Secret deleted" + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_delete_nonexistent_secret_raises_error( From b7f637b1885e120b292a300f499c79c6f49541d3 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Wed, 28 Jan 2026 09:33:15 +0000 Subject: [PATCH 090/187] Fix apply plan compatibility with old servers (#3507) --- src/dstack/_internal/core/compatibility/runs.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/core/compatibility/runs.py b/src/dstack/_internal/core/compatibility/runs.py index 5bec348d67..f1b7dc5222 100644 --- a/src/dstack/_internal/core/compatibility/runs.py +++ b/src/dstack/_internal/core/compatibility/runs.py @@ -1,7 +1,12 @@ from typing import Optional from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType -from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSpec, RunSpec +from dstack._internal.core.models.runs import ( + DEFAULT_REPLICA_GROUP_NAME, + ApplyRunPlanInput, + JobSpec, + RunSpec, +) from dstack._internal.server.schemas.runs import GetRunPlanRequest, ListRunsRequest @@ -23,7 +28,13 @@ def get_apply_plan_excludes(plan: 
ApplyRunPlanInput) -> Optional[IncludeExcludeD current_resource = plan.current_resource if current_resource is not None: current_resource_excludes: IncludeExcludeDictType = {} + apply_plan_excludes["current_resource"] = current_resource_excludes current_resource_excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec) + current_resource_excludes["jobs"] = { + "__all__": { + "job_spec": get_job_spec_excludes([job.job_spec for job in current_resource.jobs]), + } + } return {"plan": apply_plan_excludes} @@ -70,4 +81,6 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType: clients backward-compatibility with older servers. """ spec_excludes: IncludeExcludeDictType = {} + if all(s.replica_group == DEFAULT_REPLICA_GROUP_NAME for s in job_specs): + spec_excludes["replica_group"] = True return spec_excludes From 6da1554063294252261db0e6b2e42af550fb2d0e Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:26:10 +0100 Subject: [PATCH 091/187] [UI] Minor tweaks (#3508) --- .../AppLayout/TutorialPanel/constants.tsx | 10 +++++----- .../src/layouts/AppLayout/TutorialPanel/hooks.ts | 16 ++++++++-------- frontend/src/layouts/AppLayout/index.tsx | 2 +- frontend/src/locale/en.json | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/frontend/src/layouts/AppLayout/TutorialPanel/constants.tsx b/frontend/src/layouts/AppLayout/TutorialPanel/constants.tsx index 7d4bb378b2..4ee909c0cf 100644 --- a/frontend/src/layouts/AppLayout/TutorialPanel/constants.tsx +++ b/frontend/src/layouts/AppLayout/TutorialPanel/constants.tsx @@ -8,7 +8,7 @@ import { Box } from 'components'; export const tutorialPanelI18nStrings: TutorialPanelProps.I18nStrings = { labelsTaskStatus: { pending: 'Pending', 'in-progress': 'In progress', success: 'Success' }, loadingText: 'Loading', - tutorialListTitle: 'Take a tour', + tutorialListTitle: '', tutorialListDescription: 'Follow the tutorials 
below to get up to speed with dstack Sky.', tutorialListDownloadLinkText: 'Download PDF version', tutorialCompletedText: 'Completed', @@ -50,7 +50,7 @@ export enum HotspotIds { export const BILLING_TUTORIAL: TutorialPanelProps.Tutorial = { completed: false, - title: 'Set up billing', + title: 'Billing', description: ( <> @@ -80,7 +80,7 @@ export const BILLING_TUTORIAL: TutorialPanelProps.Tutorial = { export const CONFIGURE_CLI_TUTORIAL: TutorialPanelProps.Tutorial = { completed: false, - title: 'Set up the CLI', + title: 'CLI', prerequisitesAlert: 'Please, create a project before set up the CLI', description: ( <> @@ -111,7 +111,7 @@ export const CONFIGURE_CLI_TUTORIAL: TutorialPanelProps.Tutorial = { export const CREATE_FIRST_PROJECT: TutorialPanelProps.Tutorial = { completed: false, - title: 'Create a project', + title: 'Project', description: ( <> @@ -136,7 +136,7 @@ export const CREATE_FIRST_PROJECT: TutorialPanelProps.Tutorial = { export const JOIN_DISCORD_TUTORIAL: TutorialPanelProps.Tutorial = { completed: false, - title: 'Community', + title: 'Discord', description: ( <> diff --git a/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts b/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts index d3a465fcb5..74d166ba8d 100644 --- a/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts +++ b/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts @@ -167,14 +167,6 @@ export const useTutorials = () => { prerequisitesNeeded: !createProjectCompleted, }, - { - ...BILLING_TUTORIAL, - id: 4, - completed: billingCompleted, - startCallback: startBillingTutorial, - finishCallback: finishBillingTutorial, - }, - { ...QUICKSTART_TUTORIAL, id: 5, @@ -190,6 +182,14 @@ export const useTutorials = () => { completed: discordCompleted, startCallback: startDiscordTutorial, }, + + { + ...BILLING_TUTORIAL, + id: 4, + completed: billingCompleted, + startCallback: startBillingTutorial, + finishCallback: finishBillingTutorial, + }, ]; }, [ billingUrl, diff --git 
a/frontend/src/layouts/AppLayout/index.tsx b/frontend/src/layouts/AppLayout/index.tsx index d417779d41..3dd4db470b 100644 --- a/frontend/src/layouts/AppLayout/index.tsx +++ b/frontend/src/layouts/AppLayout/index.tsx @@ -183,7 +183,7 @@ const AppLayout: React.FC<{ children: React.ReactNode }> = ({ children }) => { }, process.env.UI_VERSION === 'sky' && { type: 'button', - iconName: 'suggestions', + iconName: 'support', title: t('common.tutorial_other'), onClick: toggleTutorialPanel, }, diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index c821abf317..342ce0cdd7 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -46,7 +46,7 @@ "continue": "Continue", "select_visible_columns": "Select visible columns", "tutorial": "Tutorials", - "tutorial_other": "Tour", + "tutorial_other": "Take a tour", "docs": "Docs", "discord": "Discord", "danger_zone": "Danger Zone", From 7ba2f3c17155f0082a4a3706361d23d6781744da Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:30:59 +0000 Subject: [PATCH 092/187] Fix `dstack event` compat. 
with older servers (#3509) Fixes this error: ``` $ dstack event Server validation error: {'detail': [{'loc': ['body', 'target_gateways'], 'msg': 'extra fields not permitted', 'type': 'value_error.extra'}, {'loc': ['body', 'target_secrets'], 'msg': 'extra fields not permitted', 'type': 'value_error.extra'}, {'loc': ['body', 'target_volumes'], 'msg': 'extra fields not permitted', 'type': 'value_error.extra'}]} ``` --- src/dstack/_internal/core/compatibility/events.py | 13 +++++++++++++ src/dstack/api/server/_events.py | 5 ++++- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 src/dstack/_internal/core/compatibility/events.py diff --git a/src/dstack/_internal/core/compatibility/events.py b/src/dstack/_internal/core/compatibility/events.py new file mode 100644 index 0000000000..b28db11587 --- /dev/null +++ b/src/dstack/_internal/core/compatibility/events.py @@ -0,0 +1,13 @@ +from dstack._internal.core.models.common import IncludeExcludeDictType +from dstack._internal.server.schemas.events import ListEventsRequest + + +def get_list_events_excludes(request: ListEventsRequest) -> IncludeExcludeDictType: + list_gpus_excludes: IncludeExcludeDictType = {} + if request.target_volumes is None: + list_gpus_excludes["target_volumes"] = True + if request.target_gateways is None: + list_gpus_excludes["target_gateways"] = True + if request.target_secrets is None: + list_gpus_excludes["target_secrets"] = True + return list_gpus_excludes diff --git a/src/dstack/api/server/_events.py b/src/dstack/api/server/_events.py index 5aff9c9d21..22cd8893cd 100644 --- a/src/dstack/api/server/_events.py +++ b/src/dstack/api/server/_events.py @@ -4,6 +4,7 @@ from pydantic import parse_obj_as +from dstack._internal.core.compatibility.events import get_list_events_excludes from dstack._internal.core.models.events import Event, EventTargetType from dstack._internal.server.schemas.events import LIST_EVENTS_DEFAULT_LIMIT, ListEventsRequest from dstack.api.server._group import 
APIClientGroup @@ -57,5 +58,7 @@ def list( limit=limit, ascending=ascending, ) - resp = self._request("/api/events/list", body=req.json()) + resp = self._request( + "/api/events/list", body=req.json(exclude=get_list_events_excludes(req)) + ) return parse_obj_as(list[Event.__response__], resp.json()) From 763092d92074d22888ae7ab91192ddeb8085c249 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Wed, 28 Jan 2026 13:31:12 +0000 Subject: [PATCH 093/187] Fix scaling during update to replica groups (#3510) This fix prevents the number of replicas from dropping to `replicas.min` for all existing services during a server update to 0.20.7. --- .../server/services/services/__init__.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index 1f7dcde79d..45bb1fe0fe 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -19,7 +19,11 @@ ServerClientError, SSHError, ) -from dstack._internal.core.models.configurations import SERVICE_HTTPS_DEFAULT, ServiceConfiguration +from dstack._internal.core.models.configurations import ( + DEFAULT_REPLICA_GROUP_NAME, + SERVICE_HTTPS_DEFAULT, + ServiceConfiguration, +) from dstack._internal.core.models.gateways import GatewayConfiguration, GatewayStatus from dstack._internal.core.models.instances import SSHConnectionParams from dstack._internal.core.models.runs import JobSpec, Run, RunSpec, ServiceModelSpec, ServiceSpec @@ -318,8 +322,18 @@ async def update_service_desired_replica_count( prev_counts = ( json.loads(run_model.desired_replica_counts) if run_model.desired_replica_counts else {} ) + if ( + prev_counts == {} + and len(replica_groups) == 1 + and replica_groups[0].name == DEFAULT_REPLICA_GROUP_NAME + ): + # Special case to avoid dropping the replica count to 
group.count.min + # when a 0.20.7+ server first processes a service created by a pre-0.20.7 server. + # TODO: remove once most users upgrade to 0.20.7+. + prev_counts = {DEFAULT_REPLICA_GROUP_NAME: run_model.desired_replica_count} for group in replica_groups: scaler = get_service_scaler(group.count, group.scaling) + assert group.name is not None, "Group name is always set" group_desired = scaler.get_desired_count( current_desired_count=prev_counts.get(group.name, group.count.min or 0), stats=stats, From bb0278886dd4cb662b4c60fdc52ab4879779e09d Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 28 Jan 2026 22:39:07 +0545 Subject: [PATCH 094/187] [Docs] Replica groups (#3511) * Add Replica Groups Docs * Minor edits --------- Co-authored-by: Bihan Rana Co-authored-by: peterschmidt85 --- docs/docs/concepts/services.md | 51 ++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index 745f78e3f0..0f6bf07bb8 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -164,6 +164,57 @@ Setting the minimum number of replicas to `0` allows the service to scale down t > The `scaling` property requires creating a [gateway](gateways.md). +??? info "Replica groups" + A service can include multiple replica groups. Each group can define its own `commands`, `resources` requirements, and `scaling` rules. + +
+ + ```yaml + type: service + name: llama-8b-service + + image: lmsysorg/sglang:latest + env: + - MODEL_ID=deepseek-ai/DeepSeek-R1-Distill-Llama-8B + + replicas: + - count: 1..2 + scaling: + metric: rps + target: 10 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --port 8000 \ + --trust-remote-code + resources: + gpu: 48GB + + - count: 1..4 + scaling: + metric: rps + target: 5 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --port 8000 \ + --trust-remote-code + resources: + gpu: 24GB + + port: 8000 + model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B + ``` + +
+ + > Properties such as `regions`, `port`, `image`, `env` and some other cannot be configured per replica group. This support is coming soon. + +??? info "Disaggregated serving" + Native support for disaggregated prefill and decode, allowing both worker types to run within a single service, is coming soon. + ### Model If the service is running a chat model with an OpenAI-compatible interface, From 0d58216d0ea088df7281130a4d45ed9996972040 Mon Sep 17 00:00:00 2001 From: Oleg Vavilov Date: Wed, 28 Jan 2026 22:51:40 +0300 Subject: [PATCH 095/187] Showed counters for Project and User List --- frontend/src/pages/Events/List/hooks/useFilters.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frontend/src/pages/Events/List/hooks/useFilters.ts b/frontend/src/pages/Events/List/hooks/useFilters.ts index e6a97c71e8..d9457f9064 100644 --- a/frontend/src/pages/Events/List/hooks/useFilters.ts +++ b/frontend/src/pages/Events/List/hooks/useFilters.ts @@ -148,22 +148,22 @@ export const useFilters = () => { { key: filterKeys.TARGET_FLEETS, operators: ['='], - propertyLabel: 'Target fleets', + propertyLabel: 'Target fleet IDs', }, { key: filterKeys.TARGET_INSTANCES, operators: ['='], - propertyLabel: 'Target instances', + propertyLabel: 'Target instance IDs', }, { key: filterKeys.TARGET_RUNS, operators: ['='], - propertyLabel: 'Target runs', + propertyLabel: 'Target run IDs', }, { key: filterKeys.TARGET_JOBS, operators: ['='], - propertyLabel: 'Target jobs', + propertyLabel: 'Target job IDs', }, { key: filterKeys.TARGET_VOLUMES, @@ -191,13 +191,13 @@ export const useFilters = () => { { key: filterKeys.WITHIN_FLEETS, operators: ['='], - propertyLabel: 'Within fleets', + propertyLabel: 'Within fleet IDs', }, { key: filterKeys.WITHIN_RUNS, operators: ['='], - propertyLabel: 'Within runs', + propertyLabel: 'Within run IDs', }, { From 916b94e3388b74614443cbc2043508e53eddbe25 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov 
<54148038+peterschmidt85@users.noreply.github.com> Date: Thu, 29 Jan 2026 09:42:00 +0100 Subject: [PATCH 096/187] [Docs] Added `Spot policy` (#3512) --- docs/docs/concepts/fleets.md | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md index 4def218456..dad06cb80a 100644 --- a/docs/docs/concepts/fleets.md +++ b/docs/docs/concepts/fleets.md @@ -281,12 +281,38 @@ Pre-provisioning is supported only for [VM-based backends](backends.md#vm-based) Backend fleets allow you to specify the resource requirements for the instances to be provisioned. The `resources` property syntax is the same as for [run configurations](dev-environments.md#resources). -> Not directly related, but in addition to `resources`, you can specify [`spot_policy`](../reference/dstack.yml/fleet.md#instance_types), [`instance_types`](../reference/dstack.yml/fleet.md#instance_types), [`max_price`](../reference/dstack.yml/fleet.md#max_price), [`region`](../reference/dstack.yml/fleet.md#max_price), and other [options](../reference/dstack.yml/fleet.md#). +### Spot policy - +Backend fleets allow you to specify a `spot policy`. By default, it is set to `on-demand`. If you want to use spot instances, you must set it to `auto` if you plan to use both on-demand and spot instances, or to `spot` if only spot instances are allowed. + +
+ +```yaml +type: fleet +name: my-fleet + +nodes: 0..2 + +# Uncomment to ensure instances are inter-connected +#placement: cluster + +# Allows both on-demand and spot +spot_policy: auto + +idle_duration: 1h + +resources: + gpu: 0..8 +``` + +
+ +Note that run configurations must specify their own `spot policy` which is also set to `on-demand` by default. ### Backends +Backend fleets allow you to set `backends` to specify which backends are allowed to be used. + ### Idle duration By default, instances of a backend fleet stay `idle` for 3 days and can be reused within that time. @@ -432,7 +458,7 @@ ssh_config: ``` !!! info "Reference" - The fleet configuration file supports many more options. See the [reference](../reference/dstack.yml/fleet.md). + The fleet configuration file supports additional options, including [`instance_types`](../reference/dstack.yml/fleet.md#instance_types), [`max_price`](../reference/dstack.yml/fleet.md#max_price), [`regions`](../reference/dstack.yml/fleet.md#max_price), among others. For the complete list, see the [reference](../reference/dstack.yml/fleet.md). ## Manage fleets From 9c81898b5c7cac18e4f27608682146a5d31df425 Mon Sep 17 00:00:00 2001 From: Oleg Date: Thu, 29 Jan 2026 12:30:46 +0300 Subject: [PATCH 097/187] Switch UI to pagination-based projects and users API (#3503) Switch UI to pagination-based projects and users API #3490 --- .../App/Login/LoginByGithubCallback/index.tsx | 2 +- .../useCheckingForFleetsInProjectsOfMember.ts | 13 +- frontend/src/hooks/useInfiniteScroll.ts | 61 +++++++-- frontend/src/hooks/useProjectFilter.ts | 10 +- .../layouts/AppLayout/TutorialPanel/hooks.ts | 12 +- .../src/pages/Events/List/hooks/useFilters.ts | 18 +-- frontend/src/pages/Models/List/hooks.tsx | 4 +- frontend/src/pages/Project/List/index.tsx | 98 +++++++------- .../Members/UsersAutosuggest/index.tsx | 4 +- frontend/src/pages/Project/Members/index.tsx | 4 +- .../hooks/useGetRunSpecFromYaml.ts | 1 + .../pages/Runs/CreateDevEnvironment/index.tsx | 1 + .../src/pages/Runs/List/hooks/useFilters.ts | 4 +- .../src/pages/User/Details/Projects/index.tsx | 9 +- frontend/src/pages/User/List/hooks.tsx | 39 ++++++ frontend/src/pages/User/List/index.tsx | 126 +++++++----------- 
frontend/src/services/project.ts | 17 ++- frontend/src/services/user.ts | 17 ++- frontend/src/types/project.d.ts | 10 +- frontend/src/types/user.d.ts | 12 ++ 20 files changed, 270 insertions(+), 192 deletions(-) create mode 100644 frontend/src/pages/User/List/hooks.tsx diff --git a/frontend/src/App/Login/LoginByGithubCallback/index.tsx b/frontend/src/App/Login/LoginByGithubCallback/index.tsx index af88aa72f1..45be311b6b 100644 --- a/frontend/src/App/Login/LoginByGithubCallback/index.tsx +++ b/frontend/src/App/Login/LoginByGithubCallback/index.tsx @@ -41,7 +41,7 @@ export const LoginByGithubCallback: React.FC = () => { .then(async ({ creds: { token } }) => { dispatch(setAuthData({ token })); if (process.env.UI_VERSION === 'sky') { - const result = await getProjects().unwrap(); + const result = await getProjects({}).unwrap(); if (result?.length === 0) { navigate(ROUTES.PROJECT.ADD); return; diff --git a/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts b/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts index d91b78a3b1..b330358b46 100644 --- a/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts +++ b/frontend/src/hooks/useCheckingForFleetsInProjectsOfMember.ts @@ -5,9 +5,12 @@ import { useGetOnlyNoFleetsProjectsQuery, useGetProjectsQuery } from 'services/p type Args = { projectNames?: IProject['project_name'][] }; export const useCheckingForFleetsInProjects = ({ projectNames }: Args) => { - const { data: projectsData } = useGetProjectsQuery(undefined, { - skip: !!projectNames?.length, - }); + const { data: projectsData } = useGetProjectsQuery( + {}, + { + skip: !!projectNames?.length, + }, + ); const { data: noFleetsProjectsData } = useGetOnlyNoFleetsProjectsQuery(); @@ -16,8 +19,8 @@ export const useCheckingForFleetsInProjects = ({ projectNames }: Args) => { return projectNames; } - if (projectsData) { - return projectsData.map((project) => project.project_name); + if (projectsData?.data) { + return projectsData.data.map((project) => 
project.project_name); } return []; diff --git a/frontend/src/hooks/useInfiniteScroll.ts b/frontend/src/hooks/useInfiniteScroll.ts index 727586ab00..b11115d63f 100644 --- a/frontend/src/hooks/useInfiniteScroll.ts +++ b/frontend/src/hooks/useInfiniteScroll.ts @@ -9,10 +9,14 @@ const SCROLL_POSITION_GAP = 400; type InfinityListArgs = Partial>; type ListResponse = DataItem[]; +type ResponseWithDataProp = { data: ListResponse; total_count: number }; + +type LazyQueryResponse = ResponseWithDataProp | ListResponse; type UseInfinityParams = { - useLazyQuery: UseLazyQuery, any>>; + useLazyQuery: UseLazyQuery, any>>; args: { limit?: number } & Args; + getResponseItems?: (listItem: DataItem) => Partial; getPaginationParams: (listItem: DataItem) => Partial; skip?: boolean; // options?: UseQueryStateOptions, Record>; @@ -26,22 +30,31 @@ export const useInfiniteScroll = ({ skip, }: UseInfinityParams) => { const [data, setData] = useState>([]); + const [totalCount, setTotalCount] = useState(); const scrollElement = useRef(document.documentElement); const isLoadingRef = useRef(false); + const isDisabledMoreRef = useRef(false); const lastRequestParams = useRef(undefined); - const [disabledMore, setDisabledMore] = useState(false); const { limit, ...argsProp } = args; const lastArgsProps = useRef>(null); const [getItems, { isLoading, isFetching }] = useLazyQuery({ ...args } as Args); const getDataRequest = (params: Args) => { - lastRequestParams.current = params; + if (isEqual(params, lastRequestParams.current)) { + return Promise.reject(); + } - return getItems({ + const request = getItems({ limit, ...params, } as Args).unwrap(); + + request.then(() => { + lastRequestParams.current = { ...params }; + }); + + return request; }; const getEmptyList = () => { @@ -49,9 +62,18 @@ export const useInfiniteScroll = ({ setData([]); - getDataRequest(argsProp as Args).then((result) => { - setDisabledMore(false); - setData(result as ListResponse); + getDataRequest(argsProp as 
Args).then((result: LazyQueryResponse) => { + // setDisabledMore(false); + isDisabledMoreRef.current = false; + + if ('data' in result) { + setData(result.data as ListResponse); + setTotalCount(result.total_count); + } else { + setData(result as ListResponse); + setTotalCount(); + } + isLoadingRef.current = false; }); }; @@ -64,7 +86,7 @@ export const useInfiniteScroll = ({ }, [argsProp, lastArgsProps, skip]); const getMore = async () => { - if (isLoadingRef.current || disabledMore || skip) { + if (isLoadingRef.current || isDisabledMoreRef.current || skip) { return; } @@ -76,10 +98,20 @@ export const useInfiniteScroll = ({ ...getPaginationParams(data[data.length - 1]), } as Args); - if (result.length > 0) { - setData((prev) => [...prev, ...result]); + let listResponse: ListResponse; + + if ('data' in result) { + listResponse = result.data; + setTotalCount(result.total_count); + } else { + listResponse = result; + setTotalCount(); + } + + if (listResponse.length > 0) { + setData((prev) => [...prev, ...listResponse]); } else { - setDisabledMore(true); + isDisabledMoreRef.current = true; } } catch (e) { console.log(e); @@ -87,7 +119,7 @@ export const useInfiniteScroll = ({ setTimeout(() => { isLoadingRef.current = false; - }, 10); + }, 50); }; useLayoutEffect(() => { @@ -101,7 +133,7 @@ export const useInfiniteScroll = ({ }, [data]); const onScroll = useCallback(() => { - if (disabledMore || isLoadingRef.current) { + if (isDisabledMoreRef.current || isLoadingRef.current) { return; } @@ -112,7 +144,7 @@ export const useInfiniteScroll = ({ if (scrollPositionFromBottom < SCROLL_POSITION_GAP) { getMore().catch(console.log); } - }, [disabledMore, getMore]); + }, [getMore]); useEffect(() => { document.addEventListener('scroll', onScroll); @@ -126,6 +158,7 @@ export const useInfiniteScroll = ({ return { data, + totalCount, isLoading: isLoading || (data.length === 0 && isFetching), isLoadingMore, refreshList: getEmptyList, diff --git a/frontend/src/hooks/useProjectFilter.ts 
b/frontend/src/hooks/useProjectFilter.ts index b4573b1f8e..58e573d31d 100644 --- a/frontend/src/hooks/useProjectFilter.ts +++ b/frontend/src/hooks/useProjectFilter.ts @@ -16,20 +16,20 @@ export const useProjectFilter = ({ localStorePrefix }: Args) => { null, ); - const { data: projectsData } = useGetProjectsQuery(); + const { data: projectsData } = useGetProjectsQuery({}); const projectOptions = useMemo(() => { - if (!projectsData?.length) return []; + if (!projectsData?.data?.length) return []; - return projectsData.map((project) => ({ label: project.project_name, value: project.project_name })); + return projectsData.data.map((project) => ({ label: project.project_name, value: project.project_name })); }, [projectsData]); useEffect(() => { - if (!projectsData || !selectedProject) { + if (!projectsData?.data || !selectedProject) { return; } - const hasSelectedProject = projectsData.some(({ project_name }) => selectedProject?.value === project_name); + const hasSelectedProject = projectsData.data.some(({ project_name }) => selectedProject?.value === project_name); if (!hasSelectedProject) { setSelectedProject(null); diff --git a/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts b/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts index 74d166ba8d..305a711a05 100644 --- a/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts +++ b/frontend/src/layouts/AppLayout/TutorialPanel/hooks.ts @@ -44,7 +44,7 @@ export const useTutorials = () => { } = useAppSelector(selectTutorialPanel); const { data: userBillingData } = useGetUserBillingInfoQuery({ username: useName ?? 
'' }, { skip: !useName }); - const { data: projectData } = useGetProjectsQuery(); + const { data: projectData } = useGetProjectsQuery({}); const { data: runsData } = useGetRunsQuery({ limit: 1, }); @@ -54,14 +54,14 @@ export const useTutorials = () => { useEffect(() => { if ( userBillingData && - projectData && + projectData?.data && runsData && !completeIsChecked.current && location.pathname !== ROUTES.PROJECT.ADD ) { const billingCompleted = userBillingData.balance > 0; const configureCLICompleted = runsData.length > 0; - const createProjectCompleted = projectData.length > 0; + const createProjectCompleted = projectData.data.length > 0; let tempHideStartUp = hideStartUp; @@ -88,7 +88,7 @@ export const useTutorials = () => { }, [userBillingData, runsData, projectData, location.pathname]); useEffect(() => { - if (projectData && projectData.length > 0 && !createProjectCompleted) { + if (projectData?.data && projectData.data.length > 0 && !createProjectCompleted) { dispatch( updateTutorialPanelState({ createProjectCompleted: true, @@ -114,8 +114,8 @@ export const useTutorials = () => { }, []); const startConfigCliTutorial = useCallback(() => { - if (projectData?.length) { - navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(projectData[0].project_name)); + if (projectData?.data?.length) { + navigate(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(projectData.data[0].project_name)); } }, [projectData]); diff --git a/frontend/src/pages/Events/List/hooks/useFilters.ts b/frontend/src/pages/Events/List/hooks/useFilters.ts index d9457f9064..535582195b 100644 --- a/frontend/src/pages/Events/List/hooks/useFilters.ts +++ b/frontend/src/pages/Events/List/hooks/useFilters.ts @@ -77,8 +77,8 @@ const targetTypes = [ export const useFilters = () => { const [searchParams, setSearchParams] = useSearchParams(); - const { data: projectsData } = useGetProjectsQuery(); - const { data: usersData } = useGetUserListQuery(); + const { data: projectsData } = useGetProjectsQuery({}); + const { data: 
usersData } = useGetUserListQuery({}); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => requestParamsToTokens({ searchParams, filterKeys }), @@ -92,7 +92,7 @@ export const useFilters = () => { const filteringOptions = useMemo(() => { const options: PropertyFilterProps.FilteringOption[] = []; - projectsData?.forEach(({ project_name }) => { + projectsData?.data?.forEach(({ project_name }) => { options.push({ propertyKey: filterKeys.TARGET_PROJECTS, value: project_name, @@ -104,7 +104,7 @@ export const useFilters = () => { }); }); - usersData?.forEach(({ username }) => { + usersData?.data?.forEach(({ username }) => { options.push({ propertyKey: filterKeys.TARGET_USERS, value: username, @@ -247,14 +247,16 @@ export const useFilters = () => { ...(params[filterKeys.TARGET_PROJECTS] && Array.isArray(params[filterKeys.TARGET_PROJECTS]) ? { [filterKeys.TARGET_PROJECTS]: params[filterKeys.TARGET_PROJECTS]?.map( - (name: string) => projectsData?.find(({ project_name }) => project_name === name)?.['project_id'], + (name: string) => + projectsData?.data?.find(({ project_name }) => project_name === name)?.['project_id'], ), } : {}), ...(params[filterKeys.WITHIN_PROJECTS] && Array.isArray(params[filterKeys.WITHIN_PROJECTS]) ? { [filterKeys.WITHIN_PROJECTS]: params[filterKeys.WITHIN_PROJECTS]?.map( - (name: string) => projectsData?.find(({ project_name }) => project_name === name)?.['project_id'], + (name: string) => + projectsData?.data?.find(({ project_name }) => project_name === name)?.['project_id'], ), } : {}), @@ -262,7 +264,7 @@ export const useFilters = () => { ...(params[filterKeys.TARGET_USERS] && Array.isArray(params[filterKeys.TARGET_USERS]) ? 
{ [filterKeys.TARGET_USERS]: params[filterKeys.TARGET_USERS]?.map( - (name: string) => usersData?.find(({ username }) => username === name)?.['id'], + (name: string) => usersData?.data?.find(({ username }) => username === name)?.['id'], ), } : {}), @@ -270,7 +272,7 @@ export const useFilters = () => { ...(params[filterKeys.ACTORS] && Array.isArray(params[filterKeys.ACTORS]) ? { [filterKeys.ACTORS]: params[filterKeys.ACTORS]?.map( - (name: string) => usersData?.find(({ username }) => username === name)?.['id'], + (name: string) => usersData?.data?.find(({ username }) => username === name)?.['id'], ), } : {}), diff --git a/frontend/src/pages/Models/List/hooks.tsx b/frontend/src/pages/Models/List/hooks.tsx index 912875b394..461bf28a3b 100644 --- a/frontend/src/pages/Models/List/hooks.tsx +++ b/frontend/src/pages/Models/List/hooks.tsx @@ -129,7 +129,7 @@ const filterKeys: Record = { export const useFilters = (localStorePrefix = 'models-list-page') => { const [searchParams, setSearchParams] = useSearchParams(); const { projectOptions } = useProjectFilter({ localStorePrefix }); - const { data: usersData } = useGetUserListQuery(); + const { data: usersData } = useGetUserListQuery({}); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => requestParamsToTokens({ searchParams, filterKeys }), @@ -151,7 +151,7 @@ export const useFilters = (localStorePrefix = 'models-list-page') => { }); }); - usersData?.forEach(({ username }) => { + usersData?.data?.forEach(({ username }) => { options.push({ propertyKey: filterKeys.USER_NAME, value: username, diff --git a/frontend/src/pages/Project/List/index.tsx b/frontend/src/pages/Project/List/index.tsx index af4afcf5dc..eb896df91d 100644 --- a/frontend/src/pages/Project/List/index.tsx +++ b/frontend/src/pages/Project/List/index.tsx @@ -1,44 +1,36 @@ -import React, { useMemo } from 'react'; +import React, { useMemo, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { useNavigate } from 
'react-router-dom'; -import { get as _get } from 'lodash'; - -import { - Button, - ButtonWithConfirmation, - Header, - ListEmptyMessage, - Pagination, - SpaceBetween, - Table, - TextFilter, -} from 'components'; - -import { useBreadcrumbs, useCollection } from 'hooks'; + +import { Button, ButtonWithConfirmation, Header, ListEmptyMessage, Loader, SpaceBetween, Table, TextFilter } from 'components'; + +import { DEFAULT_TABLE_PAGE_SIZE } from 'consts'; +import { useBreadcrumbs, useCollection, useInfiniteScroll } from 'hooks'; import { ROUTES } from 'routes'; -import { useGetProjectsQuery } from 'services/project'; +import { useLazyGetProjectsQuery } from 'services/project'; import { useCheckAvailableProjectPermission } from '../hooks/useCheckAvailableProjectPermission'; import { useDeleteProject } from '../hooks/useDeleteProject'; import { useColumnsDefinitions } from './hooks'; -const SEARCHABLE_COLUMNS = ['project_name', 'owner.username']; - export const ProjectList: React.FC = () => { const { t } = useTranslation(); - const { isLoading, isFetching, data, refetch } = useGetProjectsQuery(); const { isAvailableDeletingPermission, isAvailableProjectManaging } = useCheckAvailableProjectPermission(); const { deleteProject, deleteProjects, isDeleting } = useDeleteProject(); + const [filteringText, setFilteringText] = useState(''); + const [namePattern, setNamePattern] = useState(''); const navigate = useNavigate(); - const sortedData = useMemo(() => { - if (!data) return []; + const { data, isLoading, refreshList, isLoadingMore, totalCount } = useInfiniteScroll({ + useLazyQuery: useLazyGetProjectsQuery, + args: { name_pattern: namePattern, limit: DEFAULT_TABLE_PAGE_SIZE }, - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-ignore - return [...data].sort((a, b) => new Date(b.created_at) - new Date(a.created_at)); - }, [data]); + getPaginationParams: (lastProject) => ({ + prev_created_at: lastProject.created_at, + prev_id: lastProject.project_id, + }), 
+ }); useBreadcrumbs([ { @@ -51,7 +43,24 @@ export const ProjectList: React.FC = () => { navigate(ROUTES.PROJECT.ADD); }; + const onClearFilter = () => { + setNamePattern(''); + setFilteringText(''); + }; + const renderEmptyMessage = (): React.ReactNode => { + if (isLoading) { + return null; + } + + if (filteringText) { + return ( + + + + ); + } + return ( {isAvailableProjectManaging && } @@ -59,28 +68,10 @@ export const ProjectList: React.FC = () => { ); }; - const renderNoMatchMessage = (onClearFilter: () => void): React.ReactNode => { - return ( - - - - ); - }; - - const { items, actions, filteredItemsCount, collectionProps, filterProps, paginationProps } = useCollection(sortedData, { + const { items, collectionProps } = useCollection(data, { filtering: { empty: renderEmptyMessage(), - noMatch: renderNoMatchMessage(() => actions.setFiltering('')), - - filteringFunction: (projectItem: IProject, filteringText) => { - const filteringTextLowerCase = filteringText.toLowerCase(); - - return SEARCHABLE_COLUMNS.map((key) => _get(projectItem, key)).some( - (value) => typeof value === 'string' && value.trim().toLowerCase().indexOf(filteringTextLowerCase) > -1, - ); - }, }, - pagination: { pageSize: 20 }, selection: {}, }); @@ -104,9 +95,9 @@ export const ProjectList: React.FC = () => { }); const renderCounter = () => { - if (!data?.length) return ''; + if (typeof totalCount !== 'number') return ''; - return `(${data.length})`; + return `(${totalCount})`; }; return ( @@ -116,7 +107,7 @@ export const ProjectList: React.FC = () => { variant="full-page" columnDefinitions={columns} items={items} - loading={isLoading || isFetching} + loading={isLoading} loadingText={t('common.loading')} selectionType={isAvailableProjectManaging ? 
'multi' : undefined} stickyHeader={true} @@ -141,9 +132,9 @@ export const ProjectList: React.FC = () => { + + ); + } + return ( @@ -91,22 +76,10 @@ export const UserList: React.FC = () => { ); }; - const renderNoMatchMessage = (onClearFilter: () => void): React.ReactNode => { - return ( - - - - ); - }; - - const { items, actions, filteredItemsCount, collectionProps, filterProps, paginationProps } = useCollection(sortedData, { + const { items, actions, collectionProps } = useCollection(data, { filtering: { empty: renderEmptyMessage(), - noMatch: renderNoMatchMessage(() => actions.setFiltering('')), - filteringFunction: (user, filteringText) => - includeSubString(user.username, filteringText) || includeSubString(user.email ?? '', filteringText), }, - pagination: { pageSize: 20 }, selection: {}, }); @@ -145,13 +118,9 @@ export const UserList: React.FC = () => { }, [collectionProps.selectedItems]); const renderCounter = () => { - const { selectedItems } = collectionProps; - - if (!data?.length) return ''; - - if (selectedItems?.length) return `(${selectedItems?.length}/${data?.length ?? 0})`; + if (typeof totalCount !== 'number') return ''; - return `(${data.length})`; + return `(${totalCount})`; }; return ( @@ -160,9 +129,11 @@ export const UserList: React.FC = () => { {...collectionProps} variant="full-page" isItemDisabled={getIsTableItemDisabled} - columnDefinitions={COLUMN_DEFINITIONS} + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + columnDefinitions={columns} items={items} - loading={isLoading || isFetching} + loading={isLoading} loadingText={t('common.loading')} selectionType="multi" stickyHeader={true} @@ -186,9 +157,9 @@ export const UserList: React.FC = () => {
- + + } + /> + ); + }} + permanentFilters={{ within_projects: [paramProjectName] }} + showFilters={false} + /> + ); +}; diff --git a/frontend/src/pages/Project/Details/index.tsx b/frontend/src/pages/Project/Details/index.tsx index b26c921a05..f667319eb2 100644 --- a/frontend/src/pages/Project/Details/index.tsx +++ b/frontend/src/pages/Project/Details/index.tsx @@ -1,15 +1,49 @@ -import React from 'react'; -import { Outlet, useParams } from 'react-router-dom'; +import React, { useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Outlet, useMatch, useParams } from 'react-router-dom'; -import { ContentLayout, DetailsHeader } from 'components'; +import { ContentLayout, DetailsHeader, Tabs } from 'components'; + +import { ROUTES } from 'routes'; + +import styles from './styles.module.scss'; export const ProjectDetails: React.FC = () => { const params = useParams(); const paramProjectName = params.projectName ?? ''; + const { t } = useTranslation(); + + const matchSettings = useMatch(ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName)); + const matchEvents = useMatch(ROUTES.PROJECT.DETAILS.EVENTS.FORMAT(paramProjectName)); + + const tabs: { + label: string; + id: string; + href: string; + }[] = [ + { + label: t('projects.settings'), + id: 'settings', + href: ROUTES.PROJECT.DETAILS.SETTINGS.FORMAT(paramProjectName), + }, + { + label: t('projects.events'), + id: 'events', + href: ROUTES.PROJECT.DETAILS.EVENTS.FORMAT(paramProjectName), + }, + ].filter(Boolean); + + const showTabs = useMemo(() => { + return Boolean(matchSettings) || Boolean(matchEvents); + }, [matchSettings, matchEvents]); return ( - }> - - +
+ }> + {showTabs && } + + + +
); }; diff --git a/frontend/src/pages/Project/Details/styles.module.scss b/frontend/src/pages/Project/Details/styles.module.scss new file mode 100644 index 0000000000..1a7d41a9c5 --- /dev/null +++ b/frontend/src/pages/Project/Details/styles.module.scss @@ -0,0 +1,18 @@ +.page { + height: 100%; + + & [class^="awsui_tabs-content"] { + display: none; + } + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/Project/index.tsx b/frontend/src/pages/Project/index.tsx index a7bdc1617e..503d4c2228 100644 --- a/frontend/src/pages/Project/index.tsx +++ b/frontend/src/pages/Project/index.tsx @@ -2,6 +2,7 @@ import React from 'react'; export { ProjectList } from './List'; export { ProjectDetails } from './Details'; export { ProjectSettings } from './Details/Settings'; +export { Events as ProjectEvents } from './Details/Events'; export { ProjectAdd } from './Add'; export { CreateProjectWizard } from './CreateWizard'; diff --git a/frontend/src/pages/User/Details/Events/index.tsx b/frontend/src/pages/User/Details/Events/index.tsx new file mode 100644 index 0000000000..3141d6f33a --- /dev/null +++ b/frontend/src/pages/User/Details/Events/index.tsx @@ -0,0 +1,64 @@ +import React, { useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { Button, Header, SegmentedControl, SpaceBetween } from 'components'; + +import { useBreadcrumbs } from 'hooks'; +import { ROUTES } from 'routes'; + +import { EventList } from 'pages/Events/List'; + +export const Events: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramUserName = params.userName ?? 
''; + const navigate = useNavigate(); + const [filterParamName, setFilterParamName] = useState('actors'); + + useBreadcrumbs([ + { + text: t('navigation.account'), + href: ROUTES.USER.LIST, + }, + { + text: paramUserName, + href: ROUTES.USER.DETAILS.FORMAT(paramUserName), + }, + { + text: t('users.events'), + href: ROUTES.USER.EVENTS.FORMAT(paramUserName), + }, + ]); + + const goToEventsPage = () => { + navigate(ROUTES.EVENTS.LIST + `?${filterParamName}=${paramUserName}`); + }; + + return ( + { + return ( +
+ setFilterParamName(detail.selectedId as keyof TEventListFilters)} + options={[ + { text: 'Actor', id: 'actors' }, + { text: 'Target user', id: 'target_users' }, + ]} + /> + + + } + /> + ); + }} + permanentFilters={{ [filterParamName]: [paramUserName] }} + showFilters={false} + /> + ); +}; diff --git a/frontend/src/pages/User/Details/Projects/index.tsx b/frontend/src/pages/User/Details/Projects/index.tsx index 2cb9885ab3..3ce243d978 100644 --- a/frontend/src/pages/User/Details/Projects/index.tsx +++ b/frontend/src/pages/User/Details/Projects/index.tsx @@ -29,15 +29,14 @@ export const UserProjectList: React.FC = () => { text: paramUserName, href: ROUTES.USER.DETAILS.FORMAT(paramUserName), }, + { + text: t('users.projects'), + href: ROUTES.USER.PROJECTS.FORMAT(paramUserName), + }, ]); const renderEmptyMessage = (): React.ReactNode => { - return ( - - ); + return ; }; const filteredData = useMemo(() => { @@ -74,7 +73,6 @@ export const UserProjectList: React.FC = () => { return (
{ }; return ( -
-
- {t('common.edit')} - - } - > - {t('users.account_settings')} -
- + + {t('common.edit')} + + } + > + {t('users.account_settings')} + + } + > {isLoading && } {data && ( @@ -105,6 +107,6 @@ export const Settings: React.FC = () => { )} -
+ ); }; diff --git a/frontend/src/pages/User/Details/index.tsx b/frontend/src/pages/User/Details/index.tsx index 8f1b2d393d..3236d9acad 100644 --- a/frontend/src/pages/User/Details/index.tsx +++ b/frontend/src/pages/User/Details/index.tsx @@ -2,7 +2,7 @@ import React, { useEffect, useState } from 'react'; import { useTranslation } from 'react-i18next'; import { Outlet, useNavigate, useParams } from 'react-router-dom'; -import { Box, ConfirmationDialog, ContentLayout, SpaceBetween, Tabs } from 'components'; +import { Box, ConfirmationDialog, ContentLayout, Tabs } from 'components'; import { DetailsHeader } from 'components'; import { useNotifications /* usePermissionGuard*/ } from 'hooks'; @@ -13,8 +13,11 @@ import { useDeleteUsersMutation, useGetUserQuery } from 'services/user'; // import { GlobalUserRole } from '../../../types'; import { UserDetailsTabTypeEnum } from './types'; +import styles from './styles.module.scss'; + export { Settings as UserSettings } from './Settings'; export { Billing as UserBilling } from './Billing'; +export { Events as UserEvents } from './Events'; export { UserProjectList as UserProjects } from './Projects'; export const UserDetails: React.FC = () => { @@ -63,24 +66,26 @@ export const UserDetails: React.FC = () => { label: t('users.settings'), id: UserDetailsTabTypeEnum.SETTINGS, href: ROUTES.USER.DETAILS.FORMAT(paramUserName), - content: , }, { label: t('users.projects'), id: UserDetailsTabTypeEnum.PROJECTS, href: ROUTES.USER.PROJECTS.FORMAT(paramUserName), - content: , + }, + { + label: t('users.events'), + id: UserDetailsTabTypeEnum.EVENTS, + href: ROUTES.USER.EVENTS.FORMAT(paramUserName), }, process.env.UI_VERSION === 'sky' && { label: t('billing.title'), id: UserDetailsTabTypeEnum.BILLING, href: ROUTES.USER.BILLING.LIST.FORMAT(paramUserName), - content: , }, ].filter(Boolean); return ( - <> +
{ /> } > - - - + + + { onConfirm={deleteUserHandler} confirmButtonLabel={t('common.delete')} /> - +
); }; diff --git a/frontend/src/pages/User/Details/styles.module.scss b/frontend/src/pages/User/Details/styles.module.scss new file mode 100644 index 0000000000..1a7d41a9c5 --- /dev/null +++ b/frontend/src/pages/User/Details/styles.module.scss @@ -0,0 +1,18 @@ +.page { + height: 100%; + + & [class^="awsui_tabs-content"] { + display: none; + } + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/User/Details/types.ts b/frontend/src/pages/User/Details/types.ts index 8ced1c3c29..9f2a0680ac 100644 --- a/frontend/src/pages/User/Details/types.ts +++ b/frontend/src/pages/User/Details/types.ts @@ -1,5 +1,7 @@ export enum UserDetailsTabTypeEnum { SETTINGS = 'settings', PROJECTS = 'projects', + EVENTS = 'events', + ACTIVITY = 'activity', BILLING = 'billing', } diff --git a/frontend/src/router.tsx b/frontend/src/router.tsx index 34a8abaaf0..a5f2b50bd4 100644 --- a/frontend/src/router.tsx +++ b/frontend/src/router.tsx @@ -17,7 +17,7 @@ import { FleetInspect } from 'pages/Fleets/Details/Inspect'; import { InstanceList } from 'pages/Instances'; import { ModelsList } from 'pages/Models'; import { ModelDetails } from 'pages/Models/Details'; -import { CreateProjectWizard, ProjectAdd, ProjectDetails, ProjectList, ProjectSettings } from 'pages/Project'; +import { CreateProjectWizard, ProjectAdd, ProjectDetails, ProjectEvents, ProjectList, ProjectSettings } from 'pages/Project'; import { BackendAdd, BackendEdit } from 'pages/Project/Backends'; import { AddGateway, EditGateway } from 'pages/Project/Gateways'; import { @@ -33,7 +33,7 @@ import { RunInspect } from 'pages/Runs/Details/Inspect'; import { JobDetailsPage } from 'pages/Runs/Details/Jobs/Details'; import { EventsList as JobEvents } from 'pages/Runs/Details/Jobs/Events'; import { CreditsHistoryAdd, UserAdd, UserDetails, UserEdit, UserList } from 'pages/User'; -import { UserBilling, 
UserProjects, UserSettings } from 'pages/User/Details'; +import { UserBilling, UserEvents, UserProjects, UserSettings } from 'pages/User/Details'; import { AuthErrorMessage } from './App/AuthErrorMessage'; import { EventList } from './pages/Events'; @@ -86,6 +86,10 @@ export const router = createBrowserRouter([ index: true, element: , }, + { + path: ROUTES.PROJECT.DETAILS.EVENTS.TEMPLATE, + element: , + }, { path: ROUTES.PROJECT.BACKEND.ADD.TEMPLATE, element: , @@ -258,6 +262,10 @@ export const router = createBrowserRouter([ path: ROUTES.USER.PROJECTS.TEMPLATE, element: , }, + { + path: ROUTES.USER.EVENTS.TEMPLATE, + element: , + }, process.env.UI_VERSION === 'sky' && { path: ROUTES.USER.BILLING.LIST.TEMPLATE, element: , diff --git a/frontend/src/routes.ts b/frontend/src/routes.ts index 288cef72fc..7922354e19 100644 --- a/frontend/src/routes.ts +++ b/frontend/src/routes.ts @@ -23,6 +23,11 @@ export const ROUTES = { FORMAT: (projectName: string) => buildRoute(ROUTES.PROJECT.DETAILS.SETTINGS.TEMPLATE, { projectName }), }, + EVENTS: { + TEMPLATE: `/projects/:projectName/events`, + FORMAT: (projectName: string) => buildRoute(ROUTES.PROJECT.DETAILS.EVENTS.TEMPLATE, { projectName }), + }, + RUNS: { DETAILS: { TEMPLATE: `/projects/:projectName/runs/:runId`, @@ -181,6 +186,10 @@ export const ROUTES = { TEMPLATE: `/users/:userName/projects`, FORMAT: (userName: string) => buildRoute(ROUTES.USER.PROJECTS.TEMPLATE, { userName }), }, + EVENTS: { + TEMPLATE: `/users/:userName/events`, + FORMAT: (userName: string) => buildRoute(ROUTES.USER.EVENTS.TEMPLATE, { userName }), + }, BILLING: { LIST: { TEMPLATE: `/users/:userName/billing`, diff --git a/frontend/src/types/event.d.ts b/frontend/src/types/event.d.ts index dd0147fe15..0afdb7436f 100644 --- a/frontend/src/types/event.d.ts +++ b/frontend/src/types/event.d.ts @@ -1,6 +1,6 @@ declare type TEventTargetType = 'project' | 'user' | 'fleet' | 'instance' | 'run' | 'job' | 'volume' | 'gateway' | 'secret'; -declare type 
TEventListRequestParams = Omit & { +declare type TEventListFilters = { prev_recorded_at?: string; target_projects?: string[]; target_users?: string[]; @@ -17,6 +17,7 @@ declare type TEventListRequestParams = Omit & TEventListFilters; declare interface IEventTarget { type: 'project' | 'user' | 'fleet' | 'instance' | 'run' | 'job' | 'volume' | 'gateway' | 'secret'; From 2fd45cc974bf2f4db2efbabcc51c40417ae7e355 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Fri, 6 Feb 2026 07:48:01 +0000 Subject: [PATCH 122/187] [runner] Write termination_{reason,message} to the log (#3550) --- runner/internal/executor/base.go | 4 ++-- runner/internal/executor/executor.go | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/runner/internal/executor/base.go b/runner/internal/executor/base.go index 4961180e99..99e32250cb 100644 --- a/runner/internal/executor/base.go +++ b/runner/internal/executor/base.go @@ -18,8 +18,8 @@ type Executor interface { SetJobStateWithTerminationReason( ctx context.Context, state types.JobState, - termination_reason types.TerminationReason, - termination_message string, + terminationReason types.TerminationReason, + terminationMessage string, ) SetRunnerState(state string) WriteFileArchive(id string, src io.Reader) error diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go index cd3bd1be99..ea2ef63930 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/executor/executor.go @@ -296,7 +296,7 @@ func (ex *RunExecutor) SetJobState(ctx context.Context, state types.JobState) { } func (ex *RunExecutor) SetJobStateWithTerminationReason( - ctx context.Context, state types.JobState, termination_reason types.TerminationReason, termination_message string, + ctx context.Context, state types.JobState, terminationReason types.TerminationReason, terminationMessage string, ) { ex.mu.Lock() ex.jobStateHistory = append( @@ -304,11 +304,14 @@ func (ex *RunExecutor) SetJobStateWithTerminationReason( 
schemas.JobStateEvent{ State: state, Timestamp: ex.timestamp.Next(), - TerminationReason: termination_reason, - TerminationMessage: termination_message, + TerminationReason: terminationReason, + TerminationMessage: terminationMessage, }, ) ex.mu.Unlock() + if terminationReason != "" { + ctx = log.AppendArgsCtx(ctx, "termination_reason", terminationReason, "termination_message", terminationMessage) + } log.Info(ctx, "Job state changed", "new", state) } From fa001e580eb72d8e342f8683a1a730953fa7e425 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 6 Feb 2026 14:48:49 +0500 Subject: [PATCH 123/187] Disable autoflush (#3553) * Fix _get_next_instance_num rely on autoflush * Fix TestSwitchInstanceStatus rely on autoflush * Fix long write transaction when cleaning up placement groups --- .../tasks/process_submitted_jobs.py | 33 +++++++------------ src/dstack/_internal/server/db.py | 2 ++ .../server/services/test_instances.py | 4 +-- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py index 2320394436..a021096613 100644 --- a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py @@ -110,7 +110,6 @@ get_fleet_placement_group_models, get_placement_group_model_for_job, placement_group_model_to_placement_group_optional, - schedule_fleet_placement_groups_deletion, ) from dstack._internal.server.services.runs import ( run_model_to_run, @@ -481,17 +480,15 @@ async def _process_submitted_job( logger.info("%s: provisioned %s new instance(s)", fmt(job_model), len(provisioned_jobs)) provisioned_job_models = _get_job_models_for_jobs(run_model.jobs, provisioned_jobs) instance = None # Instance for attaching volumes in case of single job provisioned + # FIXME: Fleet is not locked which may lead to duplicate instance_num. 
+ # This is currently hard to fix without locking the fleet for entire provisioning duration. + # Processing should be done in multiple steps so that + # InstanceModel is created before provisioning. + taken_instance_nums = await _get_taken_instance_nums(session, fleet_model) for provisioned_job_model, jpd in zip(provisioned_job_models, jpds): provisioned_job_model.job_provisioning_data = jpd.json() switch_job_status(session, provisioned_job_model, JobStatus.PROVISIONING) - # FIXME: Fleet is not locked which may lead to duplicate instance_num. - # This is currently hard to fix without locking the fleet for entire provisioning duration. - # Processing should be done in multiple steps so that - # InstanceModel is created before provisioning. - instance_num = await _get_next_instance_num( - session=session, - fleet_model=fleet_model, - ) + instance_num = get_next_instance_num(taken_instance_nums) instance = _create_instance_model_for_job( project=project, fleet_model=fleet_model, @@ -502,6 +499,7 @@ async def _process_submitted_job( instance_num=instance_num, profile=effective_profile, ) + taken_instance_nums.add(instance_num) provisioned_job_model.job_runtime_data = _prepare_job_runtime_data( offer, multinode ).json() @@ -847,15 +845,9 @@ async def _run_jobs_on_new_instances( finally: if fleet_model is not None and len(fleet_model.instances) == 0: # Clean up placement groups that did not end up being used. - # Flush to update still uncommitted placement groups. 
- await session.flush() - await schedule_fleet_placement_groups_deletion( - session=session, - fleet_id=fleet_model.id, - except_placement_group_ids=( - [placement_group_model.id] if placement_group_model is not None else [] - ), - ) + for pg in placement_group_models: + if placement_group_model is None or pg.id != placement_group_model.id: + pg.fleet_deleted = True return None @@ -906,15 +898,14 @@ async def _create_fleet_model_for_job( return fleet_model -async def _get_next_instance_num(session: AsyncSession, fleet_model: FleetModel) -> int: +async def _get_taken_instance_nums(session: AsyncSession, fleet_model: FleetModel) -> set[int]: res = await session.execute( select(InstanceModel.instance_num).where( InstanceModel.fleet_id == fleet_model.id, InstanceModel.deleted.is_(False), ) ) - taken_instance_nums = set(res.scalars().all()) - return get_next_instance_num(taken_instance_nums) + return set(res.scalars().all()) def _create_instance_model_for_job( diff --git a/src/dstack/_internal/server/db.py b/src/dstack/_internal/server/db.py index c9ed8d5280..5f43f52e0a 100644 --- a/src/dstack/_internal/server/db.py +++ b/src/dstack/_internal/server/db.py @@ -33,6 +33,8 @@ def __init__(self, url: str, engine: Optional[AsyncEngine] = None): self.session_maker = async_sessionmaker( bind=self.engine, # type: ignore[assignment] expire_on_commit=False, + # Disable autoflush to avoid accidental long write transactions on SQLite. 
+ autoflush=False, class_=AsyncSession, ) diff --git a/src/tests/_internal/server/services/test_instances.py b/src/tests/_internal/server/services/test_instances.py index ca6432d61e..4883e309cc 100644 --- a/src/tests/_internal/server/services/test_instances.py +++ b/src/tests/_internal/server/services/test_instances.py @@ -40,7 +40,7 @@ async def test_includes_termination_reason_in_event_messages_only_once( instance.termination_reason_message = "Some err" instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATING) instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATED) - + await session.commit() events = await list_events(session) assert len(events) == 2 assert {e.message for e in events} == { @@ -61,7 +61,7 @@ async def test_includes_termination_reason_in_event_message_when_switching_direc instance.termination_reason = InstanceTerminationReason.ERROR instance.termination_reason_message = "Some err" instances_services.switch_instance_status(session, instance, InstanceStatus.TERMINATED) - + await session.commit() events = await list_events(session) assert len(events) == 1 assert events[0].message == ( From 8ff914b1ce32411c5522113b0e45918fac9b2fba Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Fri, 6 Feb 2026 17:43:16 +0100 Subject: [PATCH 124/187] Update SKILL.md with authentication details and OpenAI model usage instructions (#3554) * Update SKILL.md with authentication details and OpenAI model usage instructions * Update SKILL.md to use placeholder values for dstack token and model name in API example --- skills/dstack/SKILL.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/skills/dstack/SKILL.md b/skills/dstack/SKILL.md index 1f32c089d0..97747849f6 100644 --- a/skills/dstack/SKILL.md +++ b/skills/dstack/SKILL.md @@ -219,7 +219,14 @@ resources: **Service endpoints:** - Without gateway: `/proxy/services///` - With 
gateway: `https://./` -- For OpenAI-compatible models, use the `/v1/...` paths under the service URL and pass the dstack token in the `Authorization` header. +- Authentication: Unless `auth` is `false`, include `Authorization: Bearer ` on all service requests. +- OpenAI-compatible models: Use `service.url` from `dstack run get --json` and append `/v1` as the base URL; do **not** use deprecated `service.model.base_url` for requests. +- Example (with gateway): + ```bash + curl -sS -X POST "https://./v1/chat/completions" \ + -H "Authorization: Bearer " \ + -H "Content-Type: application/json" \ + -d '{"model":"","messages":[{"role":"user","content":"Hello"}],"max_tokens":64}' [Concept documentation](https://dstack.ai/docs/concepts/services.md) | [Configuration reference](https://dstack.ai/docs/reference/dstack.yml/service.md) @@ -235,7 +242,7 @@ resources: gpu: 24GB.. disk: 200GB -spot_policy: auto +spot_policy: auto # other values: spot, on-demand idle_duration: 5m ``` From cfea44e0f4fbcf37bdb3ec5cd15c79789bf87b9d Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Fri, 6 Feb 2026 18:02:51 +0100 Subject: [PATCH 125/187] Update SKILL.md to standardize run name formatting and add permissions guardrail for `dstack attach` (#3555) --- skills/dstack/SKILL.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/skills/dstack/SKILL.md b/skills/dstack/SKILL.md index 97747849f6..8bad2b5c79 100644 --- a/skills/dstack/SKILL.md +++ b/skills/dstack/SKILL.md @@ -101,27 +101,29 @@ If you need to prompt for next actions, be explicit about the dstack step and co `dstack attach` runs until interrupted and blocks the terminal. **Agents must avoid indefinite blocking.** If a brief attach is needed, use a timeout to capture initial output (IDE link, SSH alias) and then detach. 
-Note: `dstack attach` writes SSH alias info under `~/.dstack/ssh/config` (and may update `~/.ssh/config`) to enable `ssh `, IDE connections, port forwarding, and real-time logs (`dstack attach --logs`). If the sandbox cannot write there, the alias will not be created. +Note: `dstack attach` writes SSH alias info under `~/.dstack/ssh/config` (and may update `~/.ssh/config`) to enable `ssh `, IDE connections, port forwarding, and real-time logs (`dstack attach --logs`). If the sandbox cannot write there, the alias will not be created. + +**Permissions guardrail:** If `dstack attach` fails due to sandbox permissions, request permission escalation to run it outside the sandbox. If escalation isn’t approved or attach still fails, ask the user to run `dstack attach` locally and share the IDE link/SSH alias output. **Background attach (non-blocking default for agents):** ```bash -nohup dstack attach --logs > /tmp/.attach.log 2>&1 & echo $! > /tmp/.attach.pid +nohup dstack attach --logs > /tmp/.attach.log 2>&1 & echo $! > /tmp/.attach.pid ``` Then read the output: ```bash -tail -n 50 /tmp/.attach.log +tail -n 50 /tmp/.attach.log ``` Offer live follow only if asked: ```bash -tail -f /tmp/.attach.log +tail -f /tmp/.attach.log ``` Stop the background attach (preferred): ```bash -kill "$(cat /tmp/.attach.pid)" +kill "$(cat /tmp/.attach.pid)" ``` If the PID file is missing, fall back to a specific match (avoid killing all attaches): ```bash -pkill -f "dstack attach " +pkill -f "dstack attach " ``` **Why this helps:** it keeps the attach session alive (including port forwarding) while the agent remains usable. IDE links and SSH instructions appear in the log file -- surface them and ask whether to open the link (`open ""` on macOS, `xdg-open ""` on Linux) only after explicit approval. 
@@ -131,7 +133,7 @@ If background attach fails in the sandbox (permissions writing `~/.dstack` or `~ **"Run something":** When the user asks to run a workload (dev environment, task, service), use `dstack apply` with the appropriate configuration. Note: `dstack run` only supports `dstack run get --json` for retrieving run details -- it cannot start workloads. -**"Connect to" or "open" a dev environment:** If a dev environment is already running, use `dstack attach --logs` (agent runs it in the background by default) to surface the IDE URL (`cursor://`, `vscode://`, etc.) and SSH alias. If sandboxed attach fails, request escalation or ask the user to run attach locally and share the link. +**"Connect to" or "open" a dev environment:** If a dev environment is already running, use `dstack attach --logs` (agent runs it in the background by default) to surface the IDE URL (`cursor://`, `vscode://`, etc.) and SSH alias. If sandboxed attach fails, request escalation or ask the user to run attach locally and share the link. ## Configuration types @@ -187,7 +189,7 @@ resources: gpu: A100:40GB:2 ``` -**Port forwarding:** When you specify `ports`, `dstack apply` forwards them to `localhost` while attached. Use `dstack attach ` to reconnect and restore port forwarding. The run name becomes an SSH alias (e.g., `ssh `) for direct access. +**Port forwarding:** When you specify `ports`, `dstack apply` forwards them to `localhost` while attached. Use `dstack attach ` to reconnect and restore port forwarding. The run name becomes an SSH alias (e.g., `ssh `) for direct access. **Distributed training:** Multi-node tasks are supported (e.g., via `nodes`) and require fleets that support inter-node communication (see `placement: cluster` in fleets). 
@@ -217,10 +219,10 @@ resources: ``` **Service endpoints:** -- Without gateway: `/proxy/services///` +- Without gateway: `/proxy/services/f//` - With gateway: `https://./` - Authentication: Unless `auth` is `false`, include `Authorization: Bearer ` on all service requests. -- OpenAI-compatible models: Use `service.url` from `dstack run get --json` and append `/v1` as the base URL; do **not** use deprecated `service.model.base_url` for requests. +- OpenAI-compatible models: Use `service.url` from `dstack run get --json` and append `/v1` as the base URL; do **not** use deprecated `service.model.base_url` for requests. - Example (with gateway): ```bash curl -sS -X POST "https://./v1/chat/completions" \ From fb4a4da8cb3158da94c573b6e557692bcaf7d3ad Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 9 Feb 2026 15:21:49 +0500 Subject: [PATCH 126/187] Optimize create instance on AWS (#3556) * Implement update_provisioning_data() for AWS * Fix catching and retrying ec2_client.cancel_spot_instance_requests() * Handle ec2_client.cancel_spot_instance_requests() error * Type check backends/aws * Fix log level for Requesting instance offers --- pyproject.toml | 1 + .../_internal/core/backends/aws/compute.py | 139 ++++++++++++------ .../core/backends/base/configurator.py | 4 +- .../server/services/backends/__init__.py | 2 +- 4 files changed, 97 insertions(+), 49 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d9becf693e..8e45dde6bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,7 @@ include = [ "src/dstack/plugins", "src/dstack/_internal/server", "src/dstack/_internal/core/services", + "src/dstack/_internal/core/backends/aws", "src/dstack/_internal/core/backends/kubernetes", "src/dstack/_internal/core/backends/runpod", "src/dstack/_internal/cli/services/configurators", diff --git a/src/dstack/_internal/core/backends/aws/compute.py b/src/dstack/_internal/core/backends/aws/compute.py index be3133456c..7c08828780 100644 --- 
a/src/dstack/_internal/core/backends/aws/compute.py +++ b/src/dstack/_internal/core/backends/aws/compute.py @@ -48,6 +48,7 @@ NoCapacityError, PlacementGroupInUseError, PlacementGroupNotSupportedError, + ProvisioningError, ) from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import CoreModel @@ -291,35 +292,35 @@ def create_instance( } if reservation.get("ReservationType") == "capacity-block": is_capacity_block = True - except botocore.exceptions.ClientError as e: logger.warning("Got botocore.exceptions.ClientError: %s", e) raise NoCapacityError() + tried_zones = set() for subnet_id, az in subnet_id_to_az_map.items(): if az in tried_zones: continue tried_zones.add(az) + logger.debug("Trying provisioning %s in %s", instance_offer.instance.name, az) + image_id, username = self._get_image_id_and_username( + ec2_client=ec2_client, + region=instance_offer.region, + gpu_name=( + instance_offer.instance.resources.gpus[0].name + if len(instance_offer.instance.resources.gpus) > 0 + else None + ), + instance_type=instance_offer.instance.name, + image_config=self.config.os_images, + ) + security_group_id = self._create_security_group( + ec2_client=ec2_client, + region=instance_offer.region, + project_id=project_name, + vpc_id=vpc_id, + ) try: - logger.debug("Trying provisioning %s in %s", instance_offer.instance.name, az) - image_id, username = self._get_image_id_and_username( - ec2_client=ec2_client, - region=instance_offer.region, - gpu_name=( - instance_offer.instance.resources.gpus[0].name - if len(instance_offer.instance.resources.gpus) > 0 - else None - ), - instance_type=instance_offer.instance.name, - image_config=self.config.os_images, - ) - security_group_id = self._create_security_group( - ec2_client=ec2_client, - region=instance_offer.region, - project_id=project_name, - vpc_id=vpc_id, - ) - response = ec2_resource.create_instances( + response = ec2_resource.create_instances( # pyright: 
ignore[reportAttributeAccessIssue] **aws_resources.create_instances_struct( disk_size=disk_size, image_id=image_id, @@ -343,39 +344,85 @@ def create_instance( is_capacity_block=is_capacity_block, ) ) - instance = response[0] - instance.wait_until_running() - instance.reload() # populate instance.public_ip_address - if instance_offer.instance.resources.spot: # it will not terminate the instance - ec2_client.cancel_spot_instance_requests( - SpotInstanceRequestIds=[instance.spot_instance_request_id] - ) - hostname = _get_instance_ip(instance, allocate_public_ip) - return JobProvisioningData( - backend=instance_offer.backend, - instance_type=instance_offer.instance, - instance_id=instance.instance_id, - public_ip_enabled=allocate_public_ip, - hostname=hostname, - internal_ip=instance.private_ip_address, - region=instance_offer.region, - availability_zone=az, - reservation=instance.capacity_reservation_id, - price=instance_offer.price, - username=username, - ssh_port=22, - dockerized=True, # because `dstack-shim` is used - ssh_proxy=None, - backend_data=None, - ) except botocore.exceptions.ClientError as e: logger.warning("Got botocore.exceptions.ClientError: %s", e) if e.response["Error"]["Code"] == "InvalidParameterValue": msg = e.response["Error"].get("Message", "") raise ComputeError(f"Invalid AWS request: {msg}") continue + instance = response[0] + if instance_offer.instance.resources.spot: + # it will not terminate the instance + try: + ec2_client.cancel_spot_instance_requests( + SpotInstanceRequestIds=[instance.spot_instance_request_id] + ) + except Exception: + logger.exception( + "Failed to cancel spot instance request. The instance will be terminated." 
+ ) + self.terminate_instance( + instance_id=instance.instance_id, region=instance_offer.region + ) + raise NoCapacityError() + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=instance.instance_id, + public_ip_enabled=allocate_public_ip, + hostname=None, + internal_ip=None, + region=instance_offer.region, + availability_zone=az, + reservation=instance.capacity_reservation_id, + price=instance_offer.price, + username=username, + ssh_port=None, + dockerized=True, # because `dstack-shim` is used + ssh_proxy=None, + backend_data=None, + ) raise NoCapacityError() + def update_provisioning_data( + self, + provisioning_data: JobProvisioningData, + project_ssh_public_key: str, + project_ssh_private_key: str, + ): + ec2_resource = self.session.resource("ec2", region_name=provisioning_data.region) + instance = ec2_resource.Instance(provisioning_data.instance_id) # pyright: ignore[reportAttributeAccessIssue] + try: + instance.load() + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "InvalidInstanceID.NotFound": + logger.debug( + "Instance %s not found. Waiting for the instance to appear" + " or to timeout if the instance is manually deleted.", + provisioning_data.instance_id, + ) + # Instance may be created but not yet visible to due AWS eventual consistency, + # so we wait instead of failing immediately. + return + raise e + + state = instance.state.get("Name") + if state == "pending": + return + if state in [None, "shutting-down", "terminated", "stopping", "stopped"]: + raise ProvisioningError( + f"Failed to get instance IP address. Instance state is {state}." + ) + if state != "running": + raise ProvisioningError( + f"Failed to get instance IP address. Unknown instance state {state}." 
+ ) + + hostname = _get_instance_ip(instance, self.config.allocate_public_ips) + provisioning_data.hostname = hostname + provisioning_data.internal_ip = instance.private_ip_address + provisioning_data.ssh_port = 22 + def create_placement_group( self, placement_group: PlacementGroup, @@ -478,7 +525,7 @@ def create_gateway( allocate_public_ip=configuration.public_ip, ) try: - response = ec2_resource.create_instances(**instance_struct) + response = ec2_resource.create_instances(**instance_struct) # pyright: ignore[reportAttributeAccessIssue] except botocore.exceptions.ClientError as e: msg = f"AWS Error: {e.response['Error']['Code']}" if e.response["Error"].get("Message"): diff --git a/src/dstack/_internal/core/backends/base/configurator.py b/src/dstack/_internal/core/backends/base/configurator.py index 246d3d6118..11df1ef551 100644 --- a/src/dstack/_internal/core/backends/base/configurator.py +++ b/src/dstack/_internal/core/backends/base/configurator.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, ClassVar, Generic, List, Optional, TypeVar +from typing import Any, ClassVar, Generic, List, NoReturn, Optional, TypeVar from uuid import UUID from dstack._internal.core.backends.base.backend import Backend @@ -110,7 +110,7 @@ def get_backend(self, record: StoredBackendRecord) -> Backend: def raise_invalid_credentials_error( fields: Optional[List[List[str]]] = None, details: Optional[Any] = None -): +) -> NoReturn: msg = BackendInvalidCredentialsError.msg if details: msg += f": {details}" diff --git a/src/dstack/_internal/server/services/backends/__init__.py b/src/dstack/_internal/server/services/backends/__init__.py index ce0f17bde5..93f23e814a 100644 --- a/src/dstack/_internal/server/services/backends/__init__.py +++ b/src/dstack/_internal/server/services/backends/__init__.py @@ -361,7 +361,7 @@ def get_filtered_offers_with_backends( if not exclude_not_available or offer.availability.is_available(): yield (backend, offer) - 
logger.info("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends]) + logger.debug("Requesting instance offers from backends: %s", [b.TYPE.value for b in backends]) tasks = [run_async(get_offers_tracked, backend, requirements) for backend in backends] offers_by_backend = [] for backend, result in zip(backends, await asyncio.gather(*tasks, return_exceptions=True)): From aa51ea4819c7821369d6e2790a1673722d34a16e Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 11 Feb 2026 09:23:50 +0100 Subject: [PATCH 127/187] Show live progress in `dstack attach` and handle PortUsedError (#3562) - Show a spinner with status table in `dstack attach` while the run is provisioning, matching the UX of `dstack apply`. - Handle `PortUsedError` gracefully in both `dstack attach` and `dstack apply` instead of showing a raw traceback. The error message suggests using `-p` to override the local port mapping. - Store the port number on `PortUsedError` for structured access. 
--- src/dstack/_internal/cli/commands/attach.py | 50 ++++++++++++++++--- .../cli/services/configurators/run.py | 19 ++++++- .../_internal/core/services/ssh/ports.py | 8 +-- 3 files changed, 66 insertions(+), 11 deletions(-) diff --git a/src/dstack/_internal/cli/commands/attach.py b/src/dstack/_internal/cli/commands/attach.py index e005723e14..a22d63d37c 100644 --- a/src/dstack/_internal/cli/commands/attach.py +++ b/src/dstack/_internal/cli/commands/attach.py @@ -12,8 +12,12 @@ print_finished_message, ) from dstack._internal.cli.utils.common import console, get_start_time +from dstack._internal.cli.utils.rich import MultiItemStatus +from dstack._internal.cli.utils.run import get_runs_table from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT from dstack._internal.core.errors import CLIError +from dstack._internal.core.models.runs import RunStatus +from dstack._internal.core.services.ssh.ports import PortUsedError from dstack._internal.utils.common import get_or_error from dstack.api._public.runs import Run @@ -76,15 +80,39 @@ def _command(self, args: argparse.Namespace): run = self.api.runs.get(args.run_name) if run is None: raise CLIError(f"Run {args.run_name} not found") + + # Show live progress while waiting for the run to be ready + if _is_provisioning(run): + with MultiItemStatus(f"Attaching to [code]{run.name}[/]...", console=console) as live: + while _is_provisioning(run): + live.update(get_runs_table([run])) + time.sleep(5) + run.refresh() + console.print(get_runs_table([run], verbose=run.status == RunStatus.FAILED)) + console.print( + f"\nProvisioning [code]{run.name}[/] completed [secondary]({run.status.value})[/]" + ) + + if run.status.is_finished() and run.status != RunStatus.DONE: + raise CLIError(f"Run {args.run_name} is {run.status.value}") + exit_code = 0 try: - attached = run.attach( - ssh_identity_file=args.ssh_identity_file, - bind_address=args.host, - ports_overrides=args.ports, - replica_num=args.replica, - job_num=args.job, - ) + 
try: + attached = run.attach( + ssh_identity_file=args.ssh_identity_file, + bind_address=args.host, + ports_overrides=args.ports, + replica_num=args.replica, + job_num=args.job, + ) + except PortUsedError as e: + console.print( + f"[error]Failed to attach: port [code]{e.port}[/code] is already in use." + f" Use [code]-p[/code] in [code]dstack attach[/code] to override the local" + f" port mapping, e.g. [code]-p {e.port + 1}:{e.port}[/code].[/]" + ) + exit(1) if not attached: raise CLIError(f"Failed to attach to run {args.run_name}") _print_attached_message( @@ -159,3 +187,11 @@ def _print_attached_message( output += f"To connect to the run via SSH, use `ssh {name}`.\n" output += "Press Ctrl+C to detach..." console.print(output) + + +def _is_provisioning(run: Run) -> bool: + return run.status in ( + RunStatus.SUBMITTED, + RunStatus.PENDING, + RunStatus.PROVISIONING, + ) diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index 1077eff8a9..33ba4e10d2 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -56,6 +56,7 @@ InvalidRepoCredentialsError, get_repo_creds_and_default_branch, ) +from dstack._internal.core.services.ssh.ports import PortUsedError from dstack._internal.utils.common import local_time from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator from dstack._internal.utils.logging import get_logger @@ -168,6 +169,13 @@ def apply_configuration( ) except ServerClientError as e: raise CLIError(e.msg) + except PortUsedError as e: + console.print( + f"[error]Failed to submit: port [code]{e.port}[/code] is already in use." + f" Use [code]-p[/code] in [code]dstack apply[/code] to override the local" + f" port mapping, e.g. [code]-p {e.port + 1}:{e.port}[/code].[/]" + ) + exit(1) if command_args.detach: detach_message = f"Run [code]{run.name}[/] submitted, detaching..." 
@@ -206,7 +214,16 @@ def apply_configuration( configurator_args, _BIND_ADDRESS_ARG, None ) try: - if run.attach(bind_address=bind_address): + try: + attached = run.attach(bind_address=bind_address) + except PortUsedError as e: + console.print( + f"[error]Failed to attach: port [code]{e.port}[/code] is already in use." + f" Use [code]-p[/code] in [code]dstack attach[/code] to override the local" + f" port mapping, e.g. [code]-p {e.port + 1}:{e.port}[/code].[/]" + ) + exit(1) + if attached: for entry in run.logs(): sys.stdout.buffer.write(entry) sys.stdout.buffer.flush() diff --git a/src/dstack/_internal/core/services/ssh/ports.py b/src/dstack/_internal/core/services/ssh/ports.py index f0716e6158..1d41bcd2c6 100644 --- a/src/dstack/_internal/core/services/ssh/ports.py +++ b/src/dstack/_internal/core/services/ssh/ports.py @@ -11,7 +11,9 @@ class PortUsedError(DstackError): - pass + def __init__(self, port: int): + self.port = port + super().__init__(f"Port {port} is already in use") class PortsLock: @@ -28,10 +30,10 @@ def acquire(self) -> "PortsLock": if not local_port: # None or 0 continue if local_port in assigned_ports: - raise PortUsedError(f"Port {local_port} is already in use") + raise PortUsedError(local_port) sock = self._listen(local_port) if sock is None: - raise PortUsedError(f"Port {local_port} is already in use") + raise PortUsedError(local_port) self.sockets[remote_port] = sock assigned_ports.add(local_port) From ab16fefdb1a0a20df08d72c0b2cd4ef7248006fc Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 11 Feb 2026 12:39:18 +0100 Subject: [PATCH 128/187] =?UTF-8?q?Updated=20schema=20generation=20script?= =?UTF-8?q?=20to=20improve=20type=20handling=20and=20user-fr=E2=80=A6=20(#?= =?UTF-8?q?3563)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Updated schema generation script to improve type handling and user-friendly type display * Updated handling of 
Literal and Enum types for clearer representation. --- docs/docs/reference/dstack.yml/service.md | 12 +- docs/docs/reference/server/config.yml.md | 2 - scripts/docs/gen_schema_reference.py | 232 +++++++++++++++--- .../_internal/core/models/configurations.py | 5 +- 4 files changed, 210 insertions(+), 41 deletions(-) diff --git a/docs/docs/reference/dstack.yml/service.md b/docs/docs/reference/dstack.yml/service.md index 601a88291f..59411a540d 100644 --- a/docs/docs/reference/dstack.yml/service.md +++ b/docs/docs/reference/dstack.yml/service.md @@ -63,7 +63,7 @@ The `service` configuration type allows running [services](../../concepts/servic 1. Doesn't work if your `chat_template` uses `bos_token`. As a workaround, replace `bos_token` inside `chat_template` with the token content itself. 2. Doesn't work if `eos_token` is defined in the model repository as a dictionary. As a workaround, set `eos_token` manually, as shown in the example above (see Chat template). - If you encounter any other issues, please make sure to file a + If you encounter any ofther issues, please make sure to file a [GitHub issue](https://github.com/dstackai/dstack/issues/new/choose). 
### `scaling` @@ -127,6 +127,16 @@ The `service` configuration type allows running [services](../../concepts/servic required: true +### `replicas` + +#### `replicas[n]` + +#SCHEMA# dstack._internal.core.models.configurations.ReplicaGroup + overrides: + show_root_heading: false + type: + required: true + ### `retry` #SCHEMA# dstack._internal.core.models.profiles.ProfileRetry diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md index bf1ab46473..26c01d73e2 100644 --- a/docs/docs/reference/server/config.yml.md +++ b/docs/docs/reference/server/config.yml.md @@ -14,8 +14,6 @@ to configure [backends](../../concepts/backends.md) and other [server-level sett #SCHEMA# dstack._internal.server.services.config.ProjectConfig overrides: show_root_heading: false - backends: - type: 'Union[AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, GCPBackendConfigWithCreds, HotAisleBackendConfigWithCreds, LambdaBackendConfigWithCreds, NebiusBackendConfigWithCreds, RunpodBackendConfigWithCreds, VastAIBackendConfigWithCreds, KubernetesConfig]' #### `projects[n].backends` { #backends data-toc-label="backends" } diff --git a/scripts/docs/gen_schema_reference.py b/scripts/docs/gen_schema_reference.py index 8f25283181..1b9b000e28 100644 --- a/scripts/docs/gen_schema_reference.py +++ b/scripts/docs/gen_schema_reference.py @@ -23,24 +23,181 @@ logger.info("Generating schema reference...") -def get_type(annotation: Type) -> str: +def _is_linkable_type(annotation: Any) -> bool: + """Check if a type annotation contains a BaseModel subclass (excluding Range).""" + if inspect.isclass(annotation): + return issubclass(annotation, BaseModel) and not issubclass(annotation, Range) + origin = get_origin(annotation) + if origin is Annotated: + return _is_linkable_type(get_args(annotation)[0]) + if origin is Union: + return any(_is_linkable_type(arg) for arg in get_args(annotation)) + if origin is list: + args = get_args(annotation) + return bool(args) and 
_is_linkable_type(args[0]) + return False + + +def _type_sort_key(t: str) -> tuple: + """Sort key for type parts: primitives first, then literals, then compound types.""" + order = {"bool": 0, "int": 1, "float": 2, "str": 3} + if t in order: + return (0, order[t]) + if t.startswith('"'): + return (1, t) + if t.startswith("list"): + return (2, t) + if t == "dict": + return (3, "") + if t == "object": + return (4, "") + return (5, t) + + +def get_friendly_type(annotation: Type) -> str: + """Get a user-friendly type string for documentation. + + Produces types like: ``int | str``, ``"rps"``, ``list[object]``, ``"spot" | "on-demand" | "auto"``. + """ + # Unwrap Annotated if get_origin(annotation) is Annotated: - return get_type(get_args(annotation)[0]) + return get_friendly_type(get_args(annotation)[0]) + + # Handle Union (including Optional) if get_origin(annotation) is Union: - # Optional is Union with None. - # We don't want to show Optional[A, None] but just Optional[A] - if annotation.__name__ == "Optional": - args = ",".join(get_type(arg) for arg in get_args(annotation)[:-1]) - else: - args = ",".join(get_type(arg) for arg in get_args(annotation)) - return f"{annotation.__name__}[{args}]" + args = [a for a in get_args(annotation) if a is not type(None)] + if not args: + return "" + parts: list = [] + for arg in args: + friendly = get_friendly_type(arg) + # Split compound types (e.g., "int | str" from Range) to deduplicate, + # but avoid splitting types that contain brackets (e.g., list[...]) + if "[" not in friendly: + for part in friendly.split(" | "): + if part and part not in parts: + parts.append(part) + else: + if friendly and friendly not in parts: + parts.append(friendly) + parts.sort(key=_type_sort_key) + return " | ".join(parts) + + # Handle Literal — show as enum (specific values are in the field description) if get_origin(annotation) is Literal: - return str(annotation).split(".", maxsplit=1)[-1] + return "enum" + + # Handle list if 
get_origin(annotation) is list: - return f"List[{get_type(get_args(annotation)[0])}]" + args = get_args(annotation) + if args: + inner = get_friendly_type(args[0]) + return f"list[{inner}]" + return "list" + + # Handle dict if get_origin(annotation) is dict: - return f"Dict[{get_type(get_args(annotation)[0])}, {get_type(get_args(annotation)[1])}]" - return annotation.__name__ + return "dict" + + # Handle concrete classes + if inspect.isclass(annotation): + # Enum — list values + if issubclass(annotation, Enum): + values = [e.value for e in annotation] + return " | ".join(f'"{v}"' for v in values) + + # Range — depends on inner type parameter + if issubclass(annotation, Range): + min_field = annotation.__fields__.get("min") + if min_field and inspect.isclass(min_field.type_): + # Range[Memory] → str, Range[int] → int | str + if issubclass(min_field.type_, float): + return "str" + return "int | str" + + # Memory (float subclass that parses "8GB" strings) + from dstack._internal.core.models.resources import Memory as _Memory + + if issubclass(annotation, _Memory): + return "str" + + # BaseModel subclass (not Range) + if issubclass(annotation, BaseModel) and not issubclass(annotation, Range): + # Root models (with __root__ field) — resolve from the root type + if "__root__" in annotation.__fields__: + return get_friendly_type(annotation.__fields__["__root__"].annotation) + # Models with custom __get_validators__ accept primitive input (int, str) + # in addition to the full object form (e.g., GPUSpec, CPUSpec, DiskSpec) + if "__get_validators__" in annotation.__dict__: + return "int | str | object" + return "object" + + # ComputeCapability (tuple subclass that parses "7.5" strings) + if annotation.__name__ == "ComputeCapability": + return "float | str" + + # Constrained and primitive types — check MRO + # bool must come before int (bool is a subclass of int) + if issubclass(annotation, bool): + return "bool" + if issubclass(annotation, int): + # Duration (int subclass 
that parses "5m" strings) + if annotation.__name__ == "Duration": + return "int | str" + return "int" + if issubclass(annotation, float): + return "float" + if issubclass(annotation, str): + return "str" + if issubclass(annotation, (list, tuple)): + return "list" + if issubclass(annotation, dict): + return "dict" + + return annotation.__name__ + + return str(annotation) + + +_JSON_SCHEMA_TYPE_MAP = { + "string": "str", + "integer": "int", + "number": "float", + "boolean": "bool", + "array": "list", + "object": "object", +} + + +def _enrich_type_from_schema(friendly_type: str, prop_schema: Dict[str, Any]) -> str: + """Enrich the friendly type with extra accepted types from the JSON schema. + + Models may define ``schema_extra`` that adds ``anyOf`` entries for fields + that accept alternative input types (e.g., duration fields typed as ``int`` + but also accepting ``str`` like ``"5m"``). + """ + any_of = prop_schema.get("anyOf") + if not any_of: + return friendly_type + # Only consider string/integer — the most common alternative input types. + # Skip boolean (typically a backward-compat artifact) and object/array. 
+ _ENRICHABLE = {"string": "str", "integer": "int"} + schema_types = set() + for entry in any_of: + mapped = _ENRICHABLE.get(entry.get("type", "")) + if mapped: + schema_types.add(mapped) + # Add any schema types not already present in the friendly type + current_parts = [p.strip() for p in friendly_type.split(" | ")] + new_parts = schema_types - set(current_parts) + if not new_parts: + return friendly_type + all_parts = list(set(current_parts) | new_parts) + # If str is now present, enum is redundant + if "str" in all_parts and "enum" in all_parts: + all_parts.remove("enum") + all_parts.sort(key=_type_sort_key) + return " | ".join(all_parts) def generate_schema_reference( @@ -63,14 +220,21 @@ def generate_schema_reference( "", ] ) + # Get JSON schema to detect extra accepted types from schema_extra + try: + schema_props = cls.schema().get("properties", {}) + except Exception: + schema_props = {} for name, field in cls.__fields__.items(): default = field.default if isinstance(default, Enum): default = default.value + friendly_type = get_friendly_type(field.annotation) + friendly_type = _enrich_type_from_schema(friendly_type, schema_props.get(name, {})) values = dict( name=name, description=field.field_info.description, - type=get_type(field.annotation), + type=friendly_type, default=default, required=field.required, ) @@ -84,11 +248,7 @@ def generate_schema_reference( if field.annotation.__name__ == "Annotated": if field_type.__name__ in ["Optional", "List", "list", "Union"]: field_type = get_args(field_type)[0] - base_model = ( - inspect.isclass(field_type) - and issubclass(field_type, BaseModel) - and not issubclass(field_type, Range) - ) + base_model = _is_linkable_type(field_type) else: base_model = False _defaults = ( @@ -114,29 +274,27 @@ def generate_schema_reference( if not base_model else f"[`{values['name']}`](#{item_id_prefix}{link_name})" ) - item_optional_marker = "(Optional)" if not values["required"] else "" + item_required_marker = "(Required)" if 
values["required"] else "(Optional)" + item_type_display = f"`{values['type']}`" if values.get("type") else "" item_description = (values["description"]).replace("\n", "
") + "." item_default = _defaults if not values["required"] else _must_be item_id = f"#{values['name']}" if not base_model else f"#_{values['name']}" item_toc_label = f"data-toc-label='{values['name']}'" item_css_cass = "class='reference-item'" - rows.append( - prefix - + " ".join( - [ - f"###### {item_header}", - "-", - item_optional_marker, - item_description, - item_default, - "{", - item_id, - item_toc_label, - item_css_cass, - "}", - ] - ) - ) + parts = [ + f"###### {item_header}", + "-", + item_required_marker, + item_type_display, + item_description, + item_default, + "{", + item_id, + item_toc_label, + item_css_cass, + "}", + ] + rows.append(prefix + " ".join(p for p in parts if p)) return "\n".join(rows) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 9c8b40b6ec..db965a7697 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -322,7 +322,10 @@ def schema_extra(schema: Dict[str, Any]): class ProbeConfig(generate_dual_core_model(ProbeConfigConfig)): - type: Literal["http"] # expect other probe types in the future, namely `exec` + type: Annotated[ + Literal["http"], + Field(description="The probe type. Must be `http`"), + ] # expect other probe types in the future, namely `exec` url: Annotated[ Optional[str], Field(description=f"The URL to request. 
Defaults to `{DEFAULT_PROBE_URL}`") ] = None From b166b9ee6a6cbadfe0a52b9cbceee6dc38be12a1 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 11 Feb 2026 12:39:36 +0100 Subject: [PATCH 129/187] Replaced `datacrunch` with `verda` (#3564) --- docs/docs/concepts/backends.md | 2 +- docs/docs/guides/protips.md | 8 ++++---- docs/docs/reference/cli/dstack/offer.md | 8 ++++---- examples/models/wan22/README.md | 16 ++++++++-------- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 2ed49bda37..eba8501e25 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -853,7 +853,7 @@ Then, go ahead and configure the backend: projects: - name: main backends: - - type: datacrunch + - type: verda creds: type: api_key client_id: xfaHBqYEsArqhKWX-e52x3HH7w8T diff --git a/docs/docs/guides/protips.md b/docs/docs/guides/protips.md index a68d43f3d3..cc31e9f277 100644 --- a/docs/docs/guides/protips.md +++ b/docs/docs/guides/protips.md @@ -439,10 +439,10 @@ Getting offers... 
---> 100% # BACKEND REGION INSTANCE TYPE RESOURCES SPOT PRICE - 1 datacrunch FIN-01 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 - 2 datacrunch FIN-02 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 - 3 datacrunch FIN-02 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 - 4 datacrunch ICE-01 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 1 verda FIN-01 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 2 verda FIN-02 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 3 verda FIN-02 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 4 verda ICE-01 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 5 runpod US-KS-2 NVIDIA H100 PCIe 16xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.39 6 runpod CA NVIDIA H100 80GB HBM3 24xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.69 7 nebius eu-north1 gpu-h100-sxm 16xCPU, 200GB, 1xH100 (80GB), 100.0GB (disk) no $2.95 diff --git a/docs/docs/reference/cli/dstack/offer.md b/docs/docs/reference/cli/dstack/offer.md index ac84308aef..8da816edaa 100644 --- a/docs/docs/reference/cli/dstack/offer.md +++ b/docs/docs/reference/cli/dstack/offer.md @@ -58,10 +58,10 @@ Getting offers... 
---> 100% # BACKEND REGION INSTANCE TYPE RESOURCES SPOT PRICE - 1 datacrunch FIN-01 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 - 2 datacrunch FIN-02 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 - 3 datacrunch FIN-02 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 - 4 datacrunch ICE-01 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 1 verda FIN-01 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 2 verda FIN-02 1H100.80S.30V 30xCPU, 120GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 3 verda FIN-02 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 + 4 verda ICE-01 1H100.80S.32V 32xCPU, 185GB, 1xH100 (80GB), 100.0GB (disk) no $2.19 5 runpod US-KS-2 NVIDIA H100 PCIe 16xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.39 6 runpod CA NVIDIA H100 80GB HBM3 24xCPU, 251GB, 1xH100 (80GB), 100.0GB (disk) no $2.69 7 nebius eu-north1 gpu-h100-sxm 16xCPU, 200GB, 1xH100 (80GB), 100.0GB (disk) no $2.95 diff --git a/examples/models/wan22/README.md b/examples/models/wan22/README.md index 07d3b2c1ee..c99856fbf4 100644 --- a/examples/models/wan22/README.md +++ b/examples/models/wan22/README.md @@ -103,10 +103,10 @@ cloud resources and run the configuration. 
```shell $ dstack apply -f examples/models/wan22/.dstack.yml - # BACKEND RESOURCES INSTANCE TYPE PRICE - 1 datacrunch (FIN-01) cpu=30 mem=120GB disk=200GB H100:80GB:1 (spot) 1H100.80S.30V $0.99 - 2 datacrunch (FIN-01) cpu=30 mem=120GB disk=200GB H100:80GB:1 (spot) 1H100.80S.30V $0.99 - 3 datacrunch (FIN-02) cpu=44 mem=182GB disk=200GB H200:141GB:1 (spot) 1H200.141S.44V $0.99 + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 verda (FIN-01) cpu=30 mem=120GB disk=200GB H100:80GB:1 (spot) 1H100.80S.30V $0.99 + 2 verda (FIN-01) cpu=30 mem=120GB disk=200GB H100:80GB:1 (spot) 1H100.80S.30V $0.99 + 3 verda (FIN-02) cpu=44 mem=182GB disk=200GB H200:141GB:1 (spot) 1H200.141S.44V $0.99 ---> 100% @@ -125,10 +125,10 @@ If you want you can override the default GPU, spot policy, and even the prompt v $ PROMPT=... $ dstack apply -f examples/models/wan22/.dstack.yml --spot --gpu H100,H200:8 - # BACKEND RESOURCES INSTANCE TYPE PRICE - 1 aws (us-east-2) cpu=192 mem=2048GB disk=300GB H100:80GB:8 (spot) p5.48xlarge $6.963 - 2 datacrunch (FIN-02) cpu=176 mem=1480GB disk=300GB H100:80GB:8 (spot) 8H100.80S.176V $7.93 - 3 datacrunch (ICE-01) cpu=176 mem=1450GB disk=300GB H200:141GB:8 (spot) 8H200.141S.176V $7.96 + # BACKEND RESOURCES INSTANCE TYPE PRICE + 1 aws (us-east-2) cpu=192 mem=2048GB disk=300GB H100:80GB:8 (spot) p5.48xlarge $6.963 + 2 verda (FIN-02) cpu=176 mem=1480GB disk=300GB H100:80GB:8 (spot) 8H100.80S.176V $7.93 + 3 verda (ICE-01) cpu=176 mem=1450GB disk=300GB H200:141GB:8 (spot) 8H200.141S.176V $7.96 ---> 100% From 3c43b251f627fbb7ffc35e56b98166dd9bf68574 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Wed, 11 Feb 2026 12:30:37 +0000 Subject: [PATCH 130/187] Kubernetes: improve offers (#3548) * Skip nodes with untolerated taints * Take into account already allocated resources * Set offer resources to the lower limit of the resource requirements ranges Closes: https://github.com/dstackai/dstack/issues/3481 --- docs/docs/concepts/backends.md | 2 +- 
.../core/backends/kubernetes/compute.py | 256 +++--------- .../core/backends/kubernetes/resources.py | 363 ++++++++++++++++++ .../{test_compute.py => test_resources.py} | 22 +- 4 files changed, 423 insertions(+), 220 deletions(-) create mode 100644 src/dstack/_internal/core/backends/kubernetes/resources.py rename src/tests/_internal/core/backends/kubernetes/{test_compute.py => test_resources.py} (71%) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index eba8501e25..da58f36e03 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -1049,7 +1049,7 @@ projects: verbs: ["get", "create"] - apiGroups: [""] resources: ["pods"] - verbs: ["get", "create", "delete"] + verbs: ["get", "create", "delete", "list"] - apiGroups: [""] resources: ["services"] verbs: ["get", "create", "delete"] diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index 7f8ef9123f..10b8e5366f 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -6,7 +6,7 @@ from enum import Enum from typing import List, Optional -from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor +from gpuhunt import AcceleratorVendor from kubernetes import client from dstack._internal.core.backends.base.compute import ( @@ -19,13 +19,32 @@ generate_unique_instance_name_for_job, get_docker_commands, get_dstack_gateway_commands, - normalize_arch, ) -from dstack._internal.core.backends.base.offers import filter_offers_by_requirements from dstack._internal.core.backends.kubernetes.models import ( KubernetesConfig, KubernetesProxyJumpConfig, ) +from dstack._internal.core.backends.kubernetes.resources import ( + AMD_GPU_DEVICE_ID_LABEL_PREFIX, + AMD_GPU_NAME_TO_DEVICE_IDS, + AMD_GPU_NODE_TAINT, + AMD_GPU_RESOURCE, + DUMMY_REGION, + NVIDIA_GPU_NAME_TO_GPU_INFO, + NVIDIA_GPU_NODE_TAINT, + 
NVIDIA_GPU_PRODUCT_LABEL, + NVIDIA_GPU_RESOURCE, + TaintEffect, + format_memory, + get_amd_gpu_from_node_labels, + get_gpu_request_from_gpu_spec, + get_instance_offer_from_node, + get_instance_offers, + get_node_labels, + get_nvidia_gpu_from_node_labels, + is_hard_taint, + is_taint_tolerated, +) from dstack._internal.core.backends.kubernetes.utils import ( call_api_method, get_api_from_config_data, @@ -33,58 +52,27 @@ ) from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT from dstack._internal.core.errors import ComputeError -from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.gateways import ( GatewayComputeConfiguration, GatewayProvisioningData, ) from dstack._internal.core.models.instances import ( - Disk, Gpu, - InstanceAvailability, InstanceOfferWithAvailability, - InstanceRuntime, - InstanceType, - Resources, SSHConnectionParams, ) from dstack._internal.core.models.placement import PlacementGroup -from dstack._internal.core.models.resources import CPUSpec, GPUSpec, Memory +from dstack._internal.core.models.resources import CPUSpec, GPUSpec from dstack._internal.core.models.routers import AnyRouterConfig from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run from dstack._internal.core.models.volumes import Volume -from dstack._internal.utils.common import get_or_error, parse_memory +from dstack._internal.utils.common import get_or_error from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) JUMP_POD_IMAGE = "testcontainers/sshd:1.3.0@sha256:c50c0f59554dcdb2d9e5e705112144428ae9d04ac0af6322b365a18e24213a6a" JUMP_POD_SSH_PORT = 22 -DUMMY_REGION = "-" - -NVIDIA_GPU_RESOURCE = "nvidia.com/gpu" -NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE -NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product" - -AMD_GPU_RESOURCE = "amd.com/gpu" -AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE -# The oldest but still supported label format, the safest option, see 
the commit message: -# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48 -# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs -# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue: -# https://github.com/ROCm/k8s-device-plugin/issues/112 -AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id." - -# Taints we know and tolerate when creating our objects, e.g., the jump pod. -TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT) - -NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS} -NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys() - -AMD_GPU_DEVICE_ID_TO_GPU_INFO = { - device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids -} -AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS} class Operator(str, Enum): @@ -92,12 +80,6 @@ class Operator(str, Enum): IN = "In" -class TaintEffect(str, Enum): - NO_EXECUTE = "NoExecute" - NO_SCHEDULE = "NoSchedule" - PREFER_NO_SCHEDULE = "PreferNoSchedule" - - class KubernetesCompute( ComputeWithFilteredOffersCached, ComputeWithPrivilegedSupport, @@ -117,16 +99,7 @@ def __init__(self, config: KubernetesConfig): def get_offers_by_requirements( self, requirements: Requirements ) -> list[InstanceOfferWithAvailability]: - gpu_request = 0 - if (gpu_spec := requirements.resources.gpu) is not None: - gpu_request = _get_gpu_request_from_gpu_spec(gpu_spec) - instance_offers: list[InstanceOfferWithAvailability] = [] - for node in self.api.list_node().items: - if (instance_offer := _get_instance_offer_from_node(node, gpu_request)) is not None: - instance_offers.extend( - filter_offers_by_requirements([instance_offer], requirements) - ) - return instance_offers + return get_instance_offers(self.api, requirements) def run_job( self, @@ -191,7 +164,7 @@ def run_job( if (cpu_max := resources_spec.cpu.count.max) is not 
None: resources_limits["cpu"] = str(cpu_max) if (gpu_spec := resources_spec.gpu) is not None: - if (gpu_request := _get_gpu_request_from_gpu_spec(gpu_spec)) > 0: + if (gpu_request := get_gpu_request_from_gpu_spec(gpu_spec)) > 0: gpu_resource, node_affinity, node_taint = _get_pod_spec_parameters_for_gpu( self.api, gpu_spec ) @@ -208,14 +181,14 @@ def run_job( ) ) if (memory_min := resources_spec.memory.min) is not None: - resources_requests["memory"] = _render_memory(memory_min) + resources_requests["memory"] = format_memory(memory_min) if (memory_max := resources_spec.memory.max) is not None: - resources_limits["memory"] = _render_memory(memory_max) + resources_limits["memory"] = format_memory(memory_max) if (disk_spec := resources_spec.disk) is not None: if (disk_min := disk_spec.size.min) is not None: - resources_requests["ephemeral-storage"] = _render_memory(disk_min) + resources_requests["ephemeral-storage"] = format_memory(disk_min) if (disk_max := disk_spec.size.max) is not None: - resources_limits["ephemeral-storage"] = _render_memory(disk_max) + resources_limits["ephemeral-storage"] = format_memory(disk_max) if (shm_size := resources_spec.shm_size) is not None: shm_volume_name = "dev-shm" volumes_.append( @@ -223,7 +196,7 @@ def run_job( name=shm_volume_name, empty_dir=client.V1EmptyDirVolumeSource( medium="Memory", - size_limit=_render_memory(shm_size), + size_limit=format_memory(shm_size), ), ) ) @@ -338,10 +311,17 @@ def update_provisioning_data( provisioning_data.hostname = get_or_error(service_spec.cluster_ip) pod_spec = get_or_error(pod.spec) node = self.api.read_node(name=get_or_error(pod_spec.node_name)) - # The original offer has a list of GPUs already sliced according to pod spec's GPU resource - # request, which is inferred from dstack's GPUSpec, see _get_gpu_request_from_gpu_spec - gpu_request = len(provisioning_data.instance_type.resources.gpus) - if (instance_offer := _get_instance_offer_from_node(node, gpu_request)) is not None: + # In the 
original offer, the resources have already been adjusted according to + # the run configuration resource requirements, see get_offers_by_requirements() + original_resources = provisioning_data.instance_type.resources + instance_offer = get_instance_offer_from_node( + node=node, + cpu_request=original_resources.cpus, + memory_mib_request=original_resources.memory_mib, + gpu_request=len(original_resources.gpus), + disk_mib_request=original_resources.disk.size_mib, + ) + if instance_offer is not None: provisioning_data.instance_type = instance_offer.instance provisioning_data.region = instance_offer.region provisioning_data.price = instance_offer.price @@ -481,146 +461,6 @@ def terminate_gateway( ) -def _get_gpu_request_from_gpu_spec(gpu_spec: GPUSpec) -> int: - return gpu_spec.count.min or 0 - - -def _get_instance_offer_from_node( - node: client.V1Node, gpu_request: int -) -> Optional[InstanceOfferWithAvailability]: - try: - node_name = get_or_error(get_or_error(node.metadata).name) - node_status = get_or_error(node.status) - allocatable = get_or_error(node_status.allocatable) - _cpu_arch: Optional[str] = None - if node_status.node_info is not None: - _cpu_arch = node_status.node_info.architecture - cpu_arch = normalize_arch(_cpu_arch).to_cpu_architecture() - cpus = _parse_cpu(allocatable["cpu"]) - memory_mib = _parse_memory(allocatable["memory"]) - disk_size_mib = _parse_memory(allocatable["ephemeral-storage"]) - gpus = _get_node_gpus(node) - except (ValueError, KeyError) as e: - logger.exception("Failed to process node: %s: %s", type(e).__name__, e) - return None - return InstanceOfferWithAvailability( - backend=BackendType.KUBERNETES, - instance=InstanceType( - name=node_name, - resources=Resources( - cpus=cpus, - cpu_arch=cpu_arch, - memory_mib=memory_mib, - gpus=gpus[:gpu_request], - spot=False, - disk=Disk(size_mib=disk_size_mib), - ), - ), - price=0, - region=DUMMY_REGION, - availability=InstanceAvailability.AVAILABLE, - 
instance_runtime=InstanceRuntime.RUNNER, - ) - - -def _parse_cpu(cpu: str) -> int: - if cpu.endswith("m"): - # "m" means millicpu (1/1000 CPU), e.g., 7900m -> 7.9 -> 7 - return int(float(cpu[:-1]) / 1000) - return int(cpu) - - -def _parse_memory(memory: str) -> int: - if memory.isdigit(): - # no suffix means that the value is in bytes - return int(memory) // 2**20 - return int(parse_memory(memory, as_untis="M")) - - -def _render_memory(memory: Memory) -> str: - return f"{float(memory)}Gi" - - -def _get_node_labels(node: client.V1Node) -> dict[str, str]: - if (metadata := node.metadata) is None: - return {} - if (labels := metadata.labels) is None: - return {} - return labels - - -def _get_node_gpus(node: client.V1Node) -> list[Gpu]: - node_name = get_or_error(get_or_error(node.metadata).name) - allocatable = get_or_error(get_or_error(node.status).allocatable) - labels = _get_node_labels(node) - for gpu_resource, gpu_getter in ( - (NVIDIA_GPU_RESOURCE, _get_nvidia_gpu_from_node_labels), - (AMD_GPU_RESOURCE, _get_amd_gpu_from_node_labels), - ): - _gpu_count = allocatable.get(gpu_resource) - if not _gpu_count: - continue - gpu_count = int(_gpu_count) - if gpu_count < 1: - continue - gpu = gpu_getter(labels) - if gpu is None: - logger.warning( - "Node %s: GPU resource found, but failed to detect its model: %s=%d", - node_name, - gpu_resource, - gpu_count, - ) - return [] - return [gpu] * gpu_count - logger.debug("Node %s: no GPU resource found", node_name) - return [] - - -def _get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]: - # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery - # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or - # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB". - # Thus, we convert the product name to a known gpu name. 
- gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL) - if gpu_product is None: - return None - gpu_product = gpu_product.replace("RTX-", "RTX") - for gpu_name in NVIDIA_GPU_NAMES: - if gpu_name.lower() in gpu_product.lower().split("-"): - break - else: - return None - gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name] - gpu_memory = gpu_info.memory * 1024 - # A100 may come in two variants - if "40GB" in gpu_product: - gpu_memory = 40 * 1024 - return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory) - - -def _get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]: - # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs - gpus: set[tuple[str, int]] = set() - for label in labels: - if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX): - continue - _, _, _device_id = label.rpartition(".") - device_id = int(_device_id, 16) - gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id) - if gpu_info is None: - logger.warning("Unknown AMD GPU device id: %X", device_id) - continue - gpus.add((gpu_info.name, gpu_info.memory)) - if not gpus: - return None - if len(gpus) == 1: - gpu_name, gpu_memory_gib = next(iter(gpus)) - return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024) - logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus) - return None - - def _get_pod_spec_parameters_for_gpu( api: client.CoreV1Api, gpu_spec: GPUSpec ) -> tuple[str, client.V1NodeAffinity, str]: @@ -643,8 +483,8 @@ def _get_nvidia_gpu_node_affinity( ) -> client.V1NodeAffinity: matching_gpu_label_values: set[str] = set() for node in nodes: - labels = _get_node_labels(node) - gpu = _get_nvidia_gpu_from_node_labels(labels) + labels = get_node_labels(node) + gpu = get_nvidia_gpu_from_node_labels(labels) if gpu is not None and _gpu_matches_gpu_spec(gpu, gpu_spec): matching_gpu_label_values.add(labels[NVIDIA_GPU_PRODUCT_LABEL]) if not matching_gpu_label_values: @@ -676,8 +516,8 @@ def _get_amd_gpu_node_affinity( ) -> 
client.V1NodeAffinity: matching_device_ids: set[int] = set() for node in nodes: - labels = _get_node_labels(node) - gpu = _get_amd_gpu_from_node_labels(labels) + labels = get_node_labels(node) + gpu = get_amd_gpu_from_node_labels(labels) if gpu is not None and _gpu_matches_gpu_spec(gpu, gpu_spec): matching_device_ids.update(AMD_GPU_NAME_TO_DEVICE_IDS[gpu.name]) return client.V1NodeAffinity( @@ -828,10 +668,10 @@ def _create_jump_pod_service( taints = node_spec.taints or [] for taint in taints: # A "soft" taint, ignore. - if taint.effect == TaintEffect.PREFER_NO_SCHEDULE: + if not is_hard_taint(taint): continue has_hard_taint = True - if taint.key in TOLERATED_NODE_TAINTS: + if is_taint_tolerated(taint): tolerated_taints.add((taint.key, taint.effect)) if not has_hard_taint: toleration_required = False diff --git a/src/dstack/_internal/core/backends/kubernetes/resources.py b/src/dstack/_internal/core/backends/kubernetes/resources.py new file mode 100644 index 0000000000..018ff5fb62 --- /dev/null +++ b/src/dstack/_internal/core/backends/kubernetes/resources.py @@ -0,0 +1,363 @@ +import dataclasses +from collections.abc import Mapping +from decimal import Decimal +from enum import Enum +from typing import Callable, Optional, Union, cast + +from gpuhunt import KNOWN_AMD_GPUS, KNOWN_NVIDIA_GPUS, AcceleratorVendor + +# XXX: kubernetes.utils is missing in the stubs package +from kubernetes import utils as _kubernetes_utils # pyright: ignore[reportAttributeAccessIssue] +from kubernetes.client import CoreV1Api, V1Node, V1Taint +from typing_extensions import Self + +from dstack._internal.core.backends.base.compute import normalize_arch +from dstack._internal.core.backends.base.offers import filter_offers_by_requirements +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + Disk, + Gpu, + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceRuntime, + InstanceType, + Resources, +) +from 
dstack._internal.core.models.resources import CPUSpec, GPUSpec, Memory +from dstack._internal.core.models.runs import Requirements +from dstack._internal.utils.common import get_or_error +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +DUMMY_REGION = "-" + +NVIDIA_GPU_RESOURCE = "nvidia.com/gpu" +NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE +NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product" + +AMD_GPU_RESOURCE = "amd.com/gpu" +AMD_GPU_NODE_TAINT = AMD_GPU_RESOURCE +# The oldest but still supported label format, the safest option, see the commit message: +# https://github.com/ROCm/k8s-device-plugin/commit/c0b0231b391a56bc9da4f362d561e25e960d7a48 +# E.g., beta.amd.com/gpu.device-id.74b5=4 - A node with four MI300X VF (0x74b5) GPUs +# We cannot rely on the beta.amd.com/gpu.product-name.* label, as it may be missing, see the issue: +# https://github.com/ROCm/k8s-device-plugin/issues/112 +AMD_GPU_DEVICE_ID_LABEL_PREFIX = f"beta.{AMD_GPU_RESOURCE}.device-id." 
+ +NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS} +NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys() + +AMD_GPU_DEVICE_ID_TO_GPU_INFO = { + device_id: gpu_info for gpu_info in KNOWN_AMD_GPUS for device_id in gpu_info.device_ids +} +AMD_GPU_NAME_TO_DEVICE_IDS = {gpu.name: gpu.device_ids for gpu in KNOWN_AMD_GPUS} + + +class PodPhase(str, Enum): + PENDING = "Pending" + RUNNING = "Running" + SUCCEEDED = "Succeeded" + FAILED = "Failed" + UNKNOWN = "Unknown" # Deprecated: It isn't being set since 2015 + + @classmethod + def finished_statuses(cls) -> list["PodPhase"]: + return [cls.SUCCEEDED, cls.FAILED] + + def is_finished(self): + return self in self.finished_statuses() + + +class TaintEffect(str, Enum): + NO_EXECUTE = "NoExecute" + NO_SCHEDULE = "NoSchedule" + PREFER_NO_SCHEDULE = "PreferNoSchedule" + + +class KubernetesResource(str, Enum): + CPU = "cpu" + MEMORY = "memory" + EPHEMERAL_STORAGE = "ephemeral-storage" + NVIDIA_GPU = NVIDIA_GPU_RESOURCE + AMD_GPU = AMD_GPU_RESOURCE + + +@dataclasses.dataclass +class KubernetesResources: + cpu: Decimal = Decimal("0") + memory: Decimal = Decimal("0") + ephemeral_storage: Decimal = Decimal("0") + nvidia_gpu: Decimal = Decimal("0") + amd_gpu: Decimal = Decimal("0") + + @classmethod + def from_kubernetes_map(cls, map_: Mapping[str, str]) -> Self: + dct: dict[str, Decimal] = {} + for resource in KubernetesResource: + if (qty := map_.get(resource.value)) is not None: + dct[resource.name.lower()] = parse_quantity(qty) + return cls(**dct) + + def __getitem__(self, key: str) -> Decimal: + try: + resource = KubernetesResource(key) + except ValueError: + raise KeyError(key) + return getattr(self, resource.name.lower()) + + def __add__(self, other: Self) -> Self: + dct: dict[str, Decimal] = dataclasses.asdict(self) + qty: Decimal + for field, qty in dataclasses.asdict(other).items(): + dct[field] += qty + return type(self)(**dct) + + def __sub__(self, other: Self) -> Self: + dct: dict[str, Decimal] = 
dataclasses.asdict(self) + qty: Decimal + for field, qty in dataclasses.asdict(other).items(): + dct[field] -= qty + return type(self)(**dct) + + +parse_quantity = cast( + Callable[[Union[str, int, float, Decimal]], Decimal], _kubernetes_utils.parse_quantity +) + + +def format_memory(memory: Memory) -> str: + return f"{float(memory)}Gi" + + +def get_gpu_request_from_gpu_spec(gpu_spec: GPUSpec) -> int: + return gpu_spec.count.min or 0 + + +def get_node_name(node: V1Node) -> Optional[str]: + if (metadata := node.metadata) is None: + return None + return metadata.name + + +def get_node_labels(node: V1Node) -> dict[str, str]: + if (metadata := node.metadata) is None: + return {} + if (labels := metadata.labels) is None: + return {} + return labels + + +def is_hard_taint(taint: V1Taint) -> bool: + if taint.effect == TaintEffect.PREFER_NO_SCHEDULE: + return False + if taint.effect not in TaintEffect: + logger.warning( + "Unexpected taint %s=%s effect: %s", taint.key, taint.value or "", taint.effect + ) + return True + + +def is_taint_tolerated(taint: V1Taint) -> bool: + return taint.key in (NVIDIA_GPU_NODE_TAINT, AMD_GPU_NODE_TAINT) + + +def get_instance_offers( + api: CoreV1Api, requirements: Requirements +) -> list[InstanceOfferWithAvailability]: + resources_spec = requirements.resources + assert isinstance(resources_spec.cpu, CPUSpec) + cpu_request = resources_spec.cpu.count.min or 0 + memory_mib_request = round((resources_spec.memory.min or 0) * 1024) + gpu_request = 0 + if (gpu_spec := resources_spec.gpu) is not None: + gpu_request = get_gpu_request_from_gpu_spec(gpu_spec) + disk_mib_request = 0 + if (disk_spec := resources_spec.disk) is not None: + disk_mib_request = round((disk_spec.size.min or 0) * 1024) + + nodes_allocated_resources = _get_nodes_allocated_resources(api) + offers: list[InstanceOfferWithAvailability] = [] + for node in api.list_node().items: + if (node_name := get_node_name(node)) is None: + continue + offer = _get_instance_offer_from_node( + 
node=node, + node_name=node_name, + node_allocated_resources=nodes_allocated_resources.get(node_name), + cpu_request=cpu_request, + memory_mib_request=memory_mib_request, + gpu_request=gpu_request, + disk_mib_request=disk_mib_request, + ) + if offer is not None: + offers.extend(filter_offers_by_requirements([offer], requirements)) + return offers + + +def get_instance_offer_from_node( + node: V1Node, + *, + cpu_request: int, + memory_mib_request: int, + gpu_request: int, + disk_mib_request: int, +) -> Optional[InstanceOfferWithAvailability]: + node_name = get_node_name(node) + if node_name is None: + return None + return _get_instance_offer_from_node( + node=node, + node_name=node_name, + node_allocated_resources=None, + cpu_request=cpu_request, + memory_mib_request=memory_mib_request, + gpu_request=gpu_request, + disk_mib_request=disk_mib_request, + ) + + +def get_nvidia_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]: + # We rely on https://github.com/NVIDIA/k8s-device-plugin/tree/main/docs/gpu-feature-discovery + # to detect gpus. Note that "nvidia.com/gpu.product" is not a short gpu name like "T4" or + # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB". + # Thus, we convert the product name to a known gpu name. 
+ gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL) + if gpu_product is None: + return None + gpu_product = gpu_product.replace("RTX-", "RTX") + for gpu_name in NVIDIA_GPU_NAMES: + if gpu_name.lower() in gpu_product.lower().split("-"): + break + else: + return None + gpu_info = NVIDIA_GPU_NAME_TO_GPU_INFO[gpu_name] + gpu_memory = gpu_info.memory * 1024 + # A100 may come in two variants + if "40GB" in gpu_product: + gpu_memory = 40 * 1024 + return Gpu(vendor=AcceleratorVendor.NVIDIA, name=gpu_name, memory_mib=gpu_memory) + + +def get_amd_gpu_from_node_labels(labels: dict[str, str]) -> Optional[Gpu]: + # (AMDGPUInfo.name, AMDGPUInfo.memory) pairs + gpus: set[tuple[str, int]] = set() + for label in labels: + if not label.startswith(AMD_GPU_DEVICE_ID_LABEL_PREFIX): + continue + _, _, _device_id = label.rpartition(".") + device_id = int(_device_id, 16) + gpu_info = AMD_GPU_DEVICE_ID_TO_GPU_INFO.get(device_id) + if gpu_info is None: + logger.warning("Unknown AMD GPU device id: %X", device_id) + continue + gpus.add((gpu_info.name, gpu_info.memory)) + if not gpus: + return None + if len(gpus) == 1: + gpu_name, gpu_memory_gib = next(iter(gpus)) + return Gpu(vendor=AcceleratorVendor.AMD, name=gpu_name, memory_mib=gpu_memory_gib * 1024) + logger.warning("Multiple AMD GPU models detected: %s, ignoring all GPUs", gpus) + return None + + +def _get_instance_offer_from_node( + node: V1Node, + node_name: str, + node_allocated_resources: Optional[KubernetesResources], + cpu_request: int, + memory_mib_request: int, + gpu_request: int, + disk_mib_request: int, +) -> Optional[InstanceOfferWithAvailability]: + try: + node_spec = get_or_error(node.spec) + if any(is_hard_taint(t) and not is_taint_tolerated(t) for t in node_spec.taints or []): + logger.debug("Node %s: untolerated taint(s) found, skipping", node_name) + return None + node_status = get_or_error(node.status) + allocatable = get_or_error(node_status.allocatable) + _cpu_arch: Optional[str] = None + if node_status.node_info is 
not None: + _cpu_arch = node_status.node_info.architecture + cpu_arch = normalize_arch(_cpu_arch).to_cpu_architecture() + except ValueError as e: + logger.exception("Failed to process node %s: %s: %s", node_name, type(e).__name__, e) + return None + + node_resources = KubernetesResources.from_kubernetes_map(allocatable) + if node_allocated_resources is not None: + node_resources = node_resources - node_allocated_resources + cpu = max(0, int(node_resources.cpu)) + memory_mib = max(0, int(node_resources.memory / 2**20)) + disk_mib = max(0, int(node_resources.ephemeral_storage / 2**20)) + gpus = _get_gpus_from_node(node, node_name, node_resources) + + return InstanceOfferWithAvailability( + backend=BackendType.KUBERNETES, + instance=InstanceType( + name=node_name, + resources=Resources( + cpus=min(cpu_request, cpu), + cpu_arch=cpu_arch, + memory_mib=min(memory_mib_request, memory_mib), + gpus=gpus[:gpu_request], + disk=Disk(size_mib=min(disk_mib_request, disk_mib)), + spot=False, + ), + ), + price=0, + region=DUMMY_REGION, + availability=InstanceAvailability.AVAILABLE, + instance_runtime=InstanceRuntime.RUNNER, + ) + + +def _get_gpus_from_node( + node: V1Node, node_name: str, node_resources: KubernetesResources +) -> list[Gpu]: + labels = get_node_labels(node) + for gpu_resource, gpu_getter in ( + (NVIDIA_GPU_RESOURCE, get_nvidia_gpu_from_node_labels), + (AMD_GPU_RESOURCE, get_amd_gpu_from_node_labels), + ): + gpu_count = int(node_resources[gpu_resource]) + if gpu_count < 1: + continue + gpu = gpu_getter(labels) + if gpu is None: + logger.warning( + "Node %s: GPU resource found, but failed to detect its model: %s=%d", + node_name, + gpu_resource, + gpu_count, + ) + return [] + return [gpu] * gpu_count + logger.debug("Node %s: no available GPU resource found", node_name) + return [] + + +def _get_nodes_allocated_resources(api: CoreV1Api) -> dict[str, KubernetesResources]: + nodes_allocated_resources: dict[str, KubernetesResources] = {} + for pod in 
api.list_pod_for_all_namespaces().items: + pod_status = get_or_error(pod.status) + pod_phase = PodPhase(get_or_error(pod_status.phase)) + if pod_phase.is_finished(): + continue + pod_spec = get_or_error(pod.spec) + node_name = pod_spec.node_name + if node_name is None: + continue + pod_requests = KubernetesResources() + # TODO: Should we also check PodSpec.resources? As of 2026-01-21, it's in alpha + for container in pod_spec.containers: + if container.resources is not None and container.resources.requests: + pod_requests += KubernetesResources.from_kubernetes_map( + container.resources.requests + ) + try: + nodes_allocated_resources[node_name] += pod_requests + except KeyError: + nodes_allocated_resources[node_name] = pod_requests + return nodes_allocated_resources diff --git a/src/tests/_internal/core/backends/kubernetes/test_compute.py b/src/tests/_internal/core/backends/kubernetes/test_resources.py similarity index 71% rename from src/tests/_internal/core/backends/kubernetes/test_compute.py rename to src/tests/_internal/core/backends/kubernetes/test_resources.py index f0bcbbc174..6a74233a53 100644 --- a/src/tests/_internal/core/backends/kubernetes/test_compute.py +++ b/src/tests/_internal/core/backends/kubernetes/test_resources.py @@ -3,51 +3,51 @@ import pytest from gpuhunt import AcceleratorVendor -from dstack._internal.core.backends.kubernetes.compute import ( - _get_amd_gpu_from_node_labels, - _get_nvidia_gpu_from_node_labels, +from dstack._internal.core.backends.kubernetes.resources import ( + get_amd_gpu_from_node_labels, + get_nvidia_gpu_from_node_labels, ) from dstack._internal.core.models.instances import Gpu class TestGetNvidiaGPUFromNodeLabels: def test_returns_none_if_no_labels(self): - assert _get_nvidia_gpu_from_node_labels({}) is None + assert get_nvidia_gpu_from_node_labels({}) is None def test_returns_correct_memory_for_different_A100(self): - assert _get_nvidia_gpu_from_node_labels( + assert get_nvidia_gpu_from_node_labels( 
{"nvidia.com/gpu.product": "A100-SXM4-40GB"} ) == Gpu(vendor=AcceleratorVendor.NVIDIA, name="A100", memory_mib=40 * 1024) - assert _get_nvidia_gpu_from_node_labels( + assert get_nvidia_gpu_from_node_labels( {"nvidia.com/gpu.product": "A100-SXM4-80GB"} ) == Gpu(vendor=AcceleratorVendor.NVIDIA, name="A100", memory_mib=80 * 1024) class TestGetAMDGPUFromNodeLabels: def test_returns_no_gpus_if_no_labels(self): - assert _get_amd_gpu_from_node_labels({}) is None + assert get_amd_gpu_from_node_labels({}) is None def test_returns_known_gpu(self): - assert _get_amd_gpu_from_node_labels({"beta.amd.com/gpu.device-id.74b5": "4"}) == Gpu( + assert get_amd_gpu_from_node_labels({"beta.amd.com/gpu.device-id.74b5": "4"}) == Gpu( vendor=AcceleratorVendor.AMD, name="MI300X", memory_mib=192 * 1024 ) def test_returns_known_gpu_if_multiple_device_ids_match_the_same_gpu(self): # 4x AMD Instinct MI300X VF + 4x AMD Instinct MI300X labels = {"beta.amd.com/gpu.device-id.74b5": "4", "beta.amd.com/gpu.device-id.74a1": "4"} - assert _get_amd_gpu_from_node_labels(labels) == Gpu( + assert get_amd_gpu_from_node_labels(labels) == Gpu( vendor=AcceleratorVendor.AMD, name="MI300X", memory_mib=192 * 1024 ) def test_returns_none_if_device_id_is_unknown(self, caplog: pytest.LogCaptureFixture): caplog.set_level(logging.WARNING) - assert _get_amd_gpu_from_node_labels({"beta.amd.com/gpu.device-id.ffff": "4"}) is None + assert get_amd_gpu_from_node_labels({"beta.amd.com/gpu.device-id.ffff": "4"}) is None assert "Unknown AMD GPU device id: FFFF" in caplog.text def test_returns_none_if_multiple_gpu_models(self, caplog: pytest.LogCaptureFixture): caplog.set_level(logging.WARNING) # 4x AMD Instinct MI300X VF + 4x AMD Instinct MI325X labels = {"beta.amd.com/gpu.device-id.74b5": "4", "beta.amd.com/gpu.device-id.74a5": "4"} - assert _get_amd_gpu_from_node_labels(labels) is None + assert get_amd_gpu_from_node_labels(labels) is None assert "Multiple AMD GPU models detected" in caplog.text From 
0155a2846bc338112acff02780ba244511737b0a Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Wed, 11 Feb 2026 14:28:11 +0100 Subject: [PATCH 131/187] Updated reference schema generation - enumerating values instead of "str" or "enum" where possible --- scripts/docs/gen_schema_reference.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/docs/gen_schema_reference.py b/scripts/docs/gen_schema_reference.py index 1b9b000e28..01514b34d3 100644 --- a/scripts/docs/gen_schema_reference.py +++ b/scripts/docs/gen_schema_reference.py @@ -57,7 +57,7 @@ def _type_sort_key(t: str) -> tuple: def get_friendly_type(annotation: Type) -> str: """Get a user-friendly type string for documentation. - Produces types like: ``int | str``, ``"rps"``, ``list[object]``, ``"spot" | "on-demand" | "auto"``. + Produces types like: ``int | str``, ``"vscode" | "cursor"``, ``list[object]``. """ # Unwrap Annotated if get_origin(annotation) is Annotated: @@ -83,9 +83,10 @@ def get_friendly_type(annotation: Type) -> str: parts.sort(key=_type_sort_key) return " | ".join(parts) - # Handle Literal — show as enum (specific values are in the field description) + # Handle Literal — list values if get_origin(annotation) is Literal: - return "enum" + values = get_args(annotation) + return " | ".join(f'"{v}"' for v in values) # Handle list if get_origin(annotation) is list: @@ -184,6 +185,9 @@ def _enrich_type_from_schema(friendly_type: str, prop_schema: Dict[str, Any]) -> _ENRICHABLE = {"string": "str", "integer": "int"} schema_types = set() for entry in any_of: + # Skip entries with enum constraints — those are already captured as literal values + if "enum" in entry: + continue mapped = _ENRICHABLE.get(entry.get("type", "")) if mapped: schema_types.add(mapped) @@ -193,9 +197,9 @@ def _enrich_type_from_schema(friendly_type: str, prop_schema: Dict[str, Any]) -> if not new_parts: return friendly_type all_parts = list(set(current_parts) | new_parts) - # If str is now 
present, enum is redundant - if "str" in all_parts and "enum" in all_parts: - all_parts.remove("enum") + # If str is now present, single-value literals are redundant + if "str" in all_parts: + all_parts = [p for p in all_parts if not p.startswith('"') or p in all_parts] all_parts.sort(key=_type_sort_key) return " | ".join(all_parts) From 2f1512e049d3c0060ee8fa19b85a5779d2866592 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 12 Feb 2026 07:05:00 +0000 Subject: [PATCH 132/187] Streamline InstanceModel.remote_connection_info handling (#3566) --- .../background/tasks/process_instances.py | 9 ++++--- .../background/tasks/process_running_jobs.py | 24 +++++++++---------- .../_internal/server/services/fleets.py | 8 +++---- .../_internal/server/services/instances.py | 8 +++++-- .../_internal/server/services/proxy/repo.py | 12 +++++----- src/dstack/_internal/server/services/ssh.py | 12 +++++----- 6 files changed, 38 insertions(+), 35 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 3cb53322a7..184287b31a 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -2,7 +2,7 @@ import datetime import logging from datetime import timedelta -from typing import Any, Dict, Optional, cast +from typing import Any, Dict, Optional import gpuhunt import requests @@ -86,8 +86,10 @@ get_instance_configuration, get_instance_profile, get_instance_provisioning_data, + get_instance_remote_connection_info, get_instance_requirements, get_instance_ssh_private_keys, + is_ssh_instance, remove_dangling_tasks_from_instance, switch_instance_status, ) @@ -244,7 +246,7 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): instance = res.unique().scalar_one() if instance.status == InstanceStatus.PENDING: - if instance.remote_connection_info is not None: + if 
is_ssh_instance(instance): await _add_remote(session, instance) else: await _create_instance( @@ -323,7 +325,8 @@ async def _add_remote(session: AsyncSession, instance: InstanceModel) -> None: return try: - remote_details = RemoteConnectionInfo.parse_raw(cast(str, instance.remote_connection_info)) + remote_details = get_instance_remote_connection_info(instance) + assert remote_details is not None # Prepare connection key try: pkeys = _ssh_keys_to_pkeys(remote_details.ssh_keys) diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py index bcb35a0898..7275106ceb 100644 --- a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/tasks/process_running_jobs.py @@ -18,7 +18,6 @@ from dstack._internal.core.models.files import FileArchiveMapping from dstack._internal.core.models.instances import ( InstanceStatus, - RemoteConnectionInfo, SSHConnectionParams, ) from dstack._internal.core.models.metrics import Metric @@ -54,7 +53,10 @@ from dstack._internal.server.services import events, services from dstack._internal.server.services import files as files_services from dstack._internal.server.services import logs as logs_services -from dstack._internal.server.services.instances import get_instance_ssh_private_keys +from dstack._internal.server.services.instances import ( + get_instance_remote_connection_info, + get_instance_ssh_private_keys, +) from dstack._internal.server.services.jobs import ( find_job, get_job_attached_volumes, @@ -870,14 +872,11 @@ async def _maybe_register_replica( ssh_head_proxy: Optional[SSHConnectionParams] = None ssh_head_proxy_private_key: Optional[str] = None instance = common_utils.get_or_error(job_model.instance) - if instance.remote_connection_info is not None: - rci: RemoteConnectionInfo = RemoteConnectionInfo.__response__.parse_raw( - instance.remote_connection_info - ) - if rci.ssh_proxy 
is not None: - ssh_head_proxy = rci.ssh_proxy - ssh_head_proxy_keys = common_utils.get_or_error(rci.ssh_proxy_keys) - ssh_head_proxy_private_key = ssh_head_proxy_keys[0].private + rci = get_instance_remote_connection_info(instance) + if rci is not None and rci.ssh_proxy is not None: + ssh_head_proxy = rci.ssh_proxy + ssh_head_proxy_keys = common_utils.get_or_error(rci.ssh_proxy_keys) + ssh_head_proxy_private_key = ssh_head_proxy_keys[0].private try: await services.register_replica( session, @@ -1090,9 +1089,8 @@ def _submit_job_to_runner( None if repo_credentials is None else repo_credentials.clone_url, ) instance = job_model.instance - if instance is not None and instance.remote_connection_info is not None: - remote_info = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info) - instance_env = remote_info.env + if instance is not None and (rci := get_instance_remote_connection_info(instance)) is not None: + instance_env = rci.env else: instance_env = None diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index 9b877475f7..8febbd126b 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -2,7 +2,7 @@ from collections.abc import Callable from datetime import datetime from functools import wraps -from typing import List, Literal, Optional, Tuple, TypeVar, Union, cast +from typing import List, Literal, Optional, Tuple, TypeVar, Union from sqlalchemy import and_, func, or_, select from sqlalchemy.ext.asyncio import AsyncSession @@ -32,7 +32,6 @@ InstanceOfferWithAvailability, InstanceStatus, InstanceTerminationReason, - RemoteConnectionInfo, SSHConnectionParams, SSHKey, ) @@ -1106,9 +1105,8 @@ async def _check_ssh_hosts_not_yet_added( # ignore instances belonging to the same fleet -- in-place update/recreate if current_fleet_id is not None and instance.fleet_id == current_fleet_id: continue - instance_conn_info = 
RemoteConnectionInfo.parse_raw( - cast(str, instance.remote_connection_info) - ) + instance_conn_info = get_instance_remote_connection_info(instance) + assert instance_conn_info is not None existing_hosts.add(instance_conn_info.host) instances_already_in_fleet = [] diff --git a/src/dstack/_internal/server/services/instances.py b/src/dstack/_internal/server/services/instances.py index c311df7db7..8506ad2731 100644 --- a/src/dstack/_internal/server/services/instances.py +++ b/src/dstack/_internal/server/services/instances.py @@ -286,6 +286,10 @@ def get_instance_requirements(instance_model: InstanceModel) -> Requirements: return Requirements.__response__.parse_raw(instance_model.requirements) +def is_ssh_instance(instance_model: InstanceModel) -> bool: + return instance_model.remote_connection_info is not None + + def get_instance_remote_connection_info( instance_model: InstanceModel, ) -> Optional[RemoteConnectionInfo]: @@ -299,11 +303,11 @@ def get_instance_ssh_private_keys(instance_model: InstanceModel) -> tuple[str, O Returns a pair of SSH private keys: host key and optional proxy jump key. 
""" host_private_key = instance_model.project.ssh_private_key - if instance_model.remote_connection_info is None: + rci = get_instance_remote_connection_info(instance_model) + if rci is None: # Cloud instance return host_private_key, None # SSH instance - rci = RemoteConnectionInfo.__response__.parse_raw(instance_model.remote_connection_info) if rci.ssh_proxy is None: return host_private_key, None if rci.ssh_proxy_keys is None: diff --git a/src/dstack/_internal/server/services/proxy/repo.py b/src/dstack/_internal/server/services/proxy/repo.py index f8c8d882c8..7f1564fe62 100644 --- a/src/dstack/_internal/server/services/proxy/repo.py +++ b/src/dstack/_internal/server/services/proxy/repo.py @@ -9,7 +9,7 @@ from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.configurations import ServiceConfiguration -from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams +from dstack._internal.core.models.instances import SSHConnectionParams from dstack._internal.core.models.runs import ( JobProvisioningData, JobSpec, @@ -31,6 +31,7 @@ ) from dstack._internal.proxy.lib.repo import BaseProxyRepo from dstack._internal.server.models import JobModel, ProjectModel, RunModel +from dstack._internal.server.services.instances import get_instance_remote_connection_info from dstack._internal.server.settings import DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE from dstack._internal.utils.common import get_or_error @@ -97,11 +98,10 @@ async def get_service(self, project_name: str, run_name: str) -> Optional[Servic ssh_head_proxy: Optional[SSHConnectionParams] = None ssh_head_proxy_private_key: Optional[str] = None instance = get_or_error(job.instance) - if instance.remote_connection_info is not None: - rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info) - if rci.ssh_proxy is not None: - ssh_head_proxy = rci.ssh_proxy - 
ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private + rci = get_instance_remote_connection_info(instance) + if rci is not None and rci.ssh_proxy is not None: + ssh_head_proxy = rci.ssh_proxy + ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data) replica = Replica( id=job.id.hex, diff --git a/src/dstack/_internal/server/services/ssh.py b/src/dstack/_internal/server/services/ssh.py index d1ba8ffc83..0fa7c189e2 100644 --- a/src/dstack/_internal/server/services/ssh.py +++ b/src/dstack/_internal/server/services/ssh.py @@ -4,10 +4,11 @@ import dstack._internal.server.services.jobs as jobs_services from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams +from dstack._internal.core.models.instances import SSHConnectionParams from dstack._internal.core.models.runs import JobProvisioningData from dstack._internal.core.services.ssh.tunnel import SSH_DEFAULT_OPTIONS, SocketPair, SSHTunnel from dstack._internal.server.models import JobModel +from dstack._internal.server.services.instances import get_instance_remote_connection_info from dstack._internal.utils.common import get_or_error from dstack._internal.utils.path import FileContent @@ -46,11 +47,10 @@ def container_ssh_tunnel( ssh_head_proxy: Optional[SSHConnectionParams] = None ssh_head_proxy_private_key: Optional[str] = None instance = get_or_error(job.instance) - if instance.remote_connection_info is not None: - rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info) - if rci.ssh_proxy is not None: - ssh_head_proxy = rci.ssh_proxy - ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private + rci = get_instance_remote_connection_info(instance) + if rci is not None and rci.ssh_proxy is not None: + 
ssh_head_proxy = rci.ssh_proxy + ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private ssh_proxies = [] if ssh_head_proxy is not None: ssh_head_proxy_private_key = get_or_error(ssh_head_proxy_private_key) From 7729127656d70a9b13ea60d04885f68b9e657fdc Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 12 Feb 2026 07:27:10 +0000 Subject: [PATCH 133/187] Kubernetes: rework jump pod provisioning (#3561) With these changes, run_job() only creates a jump pod, and it is provisioned in update_provisioning_data() as follows: - check if the pod is running, try again later if not - collect all cluster external IPs, prefer the pod's node IP, fall back to a random IP if the pod's node has no external IP - connect to the jump pod, add the user's public SSH key This patch also fixes a bug introduced in https://github.com/dstackai/dstack/pull/3273 where it was not possible to add another user's public key due to `ForceCommand /bin/false`. Fixes: https://github.com/dstackai/dstack/issues/3559 --- .../core/backends/kubernetes/compute.py | 360 +++++++++--------- .../core/backends/kubernetes/resources.py | 9 +- .../core/backends/kubernetes/utils.py | 29 -- 3 files changed, 194 insertions(+), 204 deletions(-) diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index 10b8e5366f..51abddc70c 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -1,13 +1,14 @@ +import random import shlex import subprocess import tempfile -import threading import time from enum import Enum from typing import List, Optional from gpuhunt import AcceleratorVendor from kubernetes import client +from typing_extensions import Self from dstack._internal.core.backends.base.compute import ( Compute, @@ -34,6 +35,7 @@ NVIDIA_GPU_NODE_TAINT, NVIDIA_GPU_PRODUCT_LABEL, NVIDIA_GPU_RESOURCE, + PodPhase, TaintEffect, format_memory, 
get_amd_gpu_from_node_labels, @@ -41,6 +43,7 @@ get_instance_offer_from_node, get_instance_offers, get_node_labels, + get_node_name, get_nvidia_gpu_from_node_labels, is_hard_taint, is_taint_tolerated, @@ -48,10 +51,10 @@ from dstack._internal.core.backends.kubernetes.utils import ( call_api_method, get_api_from_config_data, - get_cluster_public_ip, ) from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT -from dstack._internal.core.errors import ComputeError +from dstack._internal.core.errors import ComputeError, ProvisioningError +from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.gateways import ( GatewayComputeConfiguration, GatewayProvisioningData, @@ -73,6 +76,7 @@ JUMP_POD_IMAGE = "testcontainers/sshd:1.3.0@sha256:c50c0f59554dcdb2d9e5e705112144428ae9d04ac0af6322b365a18e24213a6a" JUMP_POD_SSH_PORT = 22 +JUMP_POD_USER = "root" class Operator(str, Enum): @@ -80,6 +84,16 @@ class Operator(str, Enum): IN = "In" +class KubernetesBackendData(CoreModel): + jump_pod_name: str + jump_pod_service_name: str + user_ssh_public_key: str + + @classmethod + def load(cls, raw: str) -> Self: + return cls.__response__.parse_raw(raw) + + class KubernetesCompute( ComputeWithFilteredOffersCached, ComputeWithPrivilegedSupport, @@ -116,39 +130,19 @@ def run_job( commands = get_docker_commands( [run.run_spec.ssh_key_pub.strip(), project_ssh_public_key.strip()] ) - # Before running a job, ensure a jump pod service is running. # There is a one jump pod per Kubernetes backend that is used # as an ssh proxy jump to connect to all other services in Kubernetes. - # Setup jump pod in a separate thread to avoid long-running run_job. - # In case the thread fails, the job will be failed and resubmitted. - jump_pod_hostname = self.proxy_jump.hostname - if jump_pod_hostname is None: - jump_pod_hostname = get_cluster_public_ip(self.api) - if jump_pod_hostname is None: - raise ComputeError( - "Failed to acquire an IP for jump pod automatically. 
" - "Specify ssh_host for Kubernetes backend." - ) - jump_pod_port, created = _create_jump_pod_service_if_not_exists( + # The service is created here and configured later in update_provisioning_data() + jump_pod_name = f"dstack-{run.project_name}-ssh-jump-pod" + jump_pod_service_name = _get_pod_service_name(jump_pod_name) + _create_jump_pod_service_if_not_exists( api=self.api, namespace=self.config.namespace, - project_name=run.project_name, - ssh_public_keys=[project_ssh_public_key.strip(), run.run_spec.ssh_key_pub.strip()], + jump_pod_name=jump_pod_name, + jump_pod_service_name=jump_pod_service_name, jump_pod_port=self.proxy_jump.port, + project_ssh_public_key=project_ssh_public_key.strip(), ) - if not created: - threading.Thread( - target=_continue_setup_jump_pod, - kwargs={ - "api": self.api, - "namespace": self.config.namespace, - "project_name": run.project_name, - "project_ssh_private_key": project_ssh_private_key.strip(), - "user_ssh_public_key": run.run_spec.ssh_key_pub.strip(), - "jump_pod_host": jump_pod_hostname, - "jump_pod_port": jump_pod_port, - }, - ).start() resources_requests: dict[str, str] = {} resources_limits: dict[str, str] = {} @@ -264,27 +258,32 @@ def run_job( ), ), ) + + backend_data = KubernetesBackendData( + jump_pod_name=jump_pod_name, + jump_pod_service_name=jump_pod_service_name, + user_ssh_public_key=run.run_spec.ssh_key_pub.strip(), + ) return JobProvisioningData( backend=instance_offer.backend, - instance_type=instance_offer.instance, instance_id=instance_name, - # Although we can already get Service's ClusterIP from the `V1Service` object returned - # by the `create_namespaced_service` method, we still need 1) updated instance offer - # 2) PodIP for multinode runs. - # We'll update all these fields once the pod is assigned to the node. 
- hostname=None, - internal_ip=None, region=instance_offer.region, price=instance_offer.price, username="root", ssh_port=DSTACK_RUNNER_SSH_PORT, dockerized=False, - ssh_proxy=SSHConnectionParams( - hostname=jump_pod_hostname, - username="root", - port=jump_pod_port, - ), - backend_data=None, + # Although we can already get Service's ClusterIP from the `V1Service` object returned + # by the `create_namespaced_service` method, we still need: + # - updated instance offer + # - job pod's PodIP for multinode runs + # - jump pod node's ExternalIP and jump pod service's NodePort for ssh_proxy + # We'll update all these fields once both the jump pod and the job pod are assigned + # to the nodes. + hostname=None, + instance_type=instance_offer.instance, + internal_ip=None, + ssh_proxy=None, + backend_data=backend_data.json(), ) def update_provisioning_data( @@ -293,6 +292,26 @@ def update_provisioning_data( project_ssh_public_key: str, project_ssh_private_key: str, ): + if provisioning_data.backend_data is not None: + # Before running a job, ensure the jump pod is running and has user's public SSH key. + backend_data = KubernetesBackendData.load(provisioning_data.backend_data) + ssh_proxy = _check_and_configure_jump_pod_service( + api=self.api, + namespace=self.config.namespace, + jump_pod_name=backend_data.jump_pod_name, + jump_pod_service_name=backend_data.jump_pod_service_name, + jump_pod_hostname=self.proxy_jump.hostname, + project_ssh_private_key=project_ssh_private_key, + user_ssh_public_key=backend_data.user_ssh_public_key, + ) + if ssh_proxy is None: + # Jump pod is not ready yet + return + provisioning_data.ssh_proxy = ssh_proxy + # Remove backend data to save space in DB and skip this step + # in case update_provisioning_data() is called again. 
+ provisioning_data.backend_data = None + pod = self.api.read_namespaced_pod( name=provisioning_data.instance_id, namespace=self.config.namespace, @@ -560,36 +579,14 @@ def _gpu_matches_gpu_spec(gpu: Gpu, gpu_spec: GPUSpec) -> bool: return True -def _continue_setup_jump_pod( - api: client.CoreV1Api, - namespace: str, - project_name: str, - project_ssh_private_key: str, - user_ssh_public_key: str, - jump_pod_host: str, - jump_pod_port: int, -): - _wait_for_pod_ready( - api=api, - namespace=namespace, - pod_name=_get_jump_pod_name(project_name), - ) - _add_authorized_key_to_jump_pod( - jump_pod_host=jump_pod_host, - jump_pod_port=jump_pod_port, - ssh_private_key=project_ssh_private_key, - ssh_authorized_key=user_ssh_public_key, - ) - - def _create_jump_pod_service_if_not_exists( api: client.CoreV1Api, namespace: str, - project_name: str, - ssh_public_keys: list[str], + jump_pod_name: str, + jump_pod_service_name: str, jump_pod_port: Optional[int], -) -> tuple[int, bool]: - created = False + project_ssh_public_key: str, +) -> None: service: Optional[client.V1Service] = None pod: Optional[client.V1Pod] = None _namespace = call_api_method( @@ -609,52 +606,27 @@ def _create_jump_pod_service_if_not_exists( service = call_api_method( api.read_namespaced_service, expected=404, - name=_get_jump_pod_service_name(project_name), + name=jump_pod_service_name, namespace=namespace, ) pod = call_api_method( api.read_namespaced_pod, expected=404, - name=_get_jump_pod_name(project_name), + name=jump_pod_name, namespace=namespace, ) + # The service may exist without the pod if the node on which the jump pod was running # has been deleted. 
- if service is None or pod is None: - service = _create_jump_pod_service( - api=api, - namespace=namespace, - project_name=project_name, - ssh_public_keys=ssh_public_keys, - jump_pod_port=jump_pod_port, - ) - created = True - port: Optional[int] = None - if service.spec is not None and service.spec.ports: - port = service.spec.ports[0].node_port - if port is None: - raise ComputeError( - f"Failed to get NodePort of jump pod Service for project '{project_name}'" - ) - return port, created - + if service is not None and pod is not None: + return -def _create_jump_pod_service( - api: client.CoreV1Api, - namespace: str, - project_name: str, - ssh_public_keys: list[str], - jump_pod_port: Optional[int], -) -> client.V1Service: - # TODO use restricted ssh-forwarding-only user for jump pod instead of root. - pod_name = _get_jump_pod_name(project_name) call_api_method( api.delete_namespaced_pod, expected=404, namespace=namespace, - name=pod_name, + name=jump_pod_name, ) - # False if we found at least one node without any "hard" taint, that is, if we don't need to # specify the toleration. 
toleration_required = True @@ -684,17 +656,16 @@ def _create_jump_pod_service( ) if not tolerations: logger.warning("No appropriate node found, the jump pod may never be scheduled") - - commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys) + commands = _get_jump_pod_commands(authorized_keys=[project_ssh_public_key]) pod = client.V1Pod( metadata=client.V1ObjectMeta( - name=pod_name, - labels={"app.kubernetes.io/name": pod_name}, + name=jump_pod_name, + labels={"app.kubernetes.io/name": jump_pod_name}, ), spec=client.V1PodSpec( containers=[ client.V1Container( - name=f"{pod_name}-container", + name=f"{jump_pod_name}-container", image=JUMP_POD_IMAGE, command=["/bin/sh"], args=["-c", " && ".join(commands)], @@ -712,18 +683,17 @@ def _create_jump_pod_service( namespace=namespace, body=pod, ) - service_name = _get_jump_pod_service_name(project_name) call_api_method( api.delete_namespaced_service, expected=404, namespace=namespace, - name=service_name, + name=jump_pod_service_name, ) service = client.V1Service( - metadata=client.V1ObjectMeta(name=service_name), + metadata=client.V1ObjectMeta(name=jump_pod_service_name), spec=client.V1ServiceSpec( type="NodePort", - selector={"app.kubernetes.io/name": pod_name}, + selector={"app.kubernetes.io/name": jump_pod_name}, ports=[ client.V1ServicePort( port=JUMP_POD_SSH_PORT, @@ -733,12 +703,110 @@ def _create_jump_pod_service( ], ), ) - return api.create_namespaced_service( + api.create_namespaced_service( namespace=namespace, body=service, ) +def _check_and_configure_jump_pod_service( + api: client.CoreV1Api, + namespace: str, + jump_pod_name: str, + jump_pod_service_name: str, + jump_pod_hostname: Optional[str], + project_ssh_private_key: str, + user_ssh_public_key: str, +) -> Optional[SSHConnectionParams]: + jump_pod = api.read_namespaced_pod( + namespace=namespace, + name=jump_pod_name, + ) + jump_pod_phase = PodPhase(get_or_error(get_or_error(jump_pod.status).phase)) + if jump_pod_phase.is_finished(): + raise 
ProvisioningError(f"Jump pod {jump_pod_name} is unexpectedly finished") + if not jump_pod_phase.is_running(): + logger.debug("Jump pod %s is not running yet", jump_pod_name) + return None + + if jump_pod_hostname is None: + jump_pod_node_name = get_or_error(get_or_error(jump_pod.spec).node_name) + cluster_external_ips: list[str] = [] + for node in api.list_node().items: + node_external_ips = [ + node_address.address + for node_address in get_or_error(get_or_error(node.status).addresses) + if node_address.type == "ExternalIP" + ] + if node_external_ips: + if get_node_name(node) == jump_pod_node_name: + jump_pod_hostname = node_external_ips[0] + break + cluster_external_ips.extend(node_external_ips) + if jump_pod_hostname is None: + if not cluster_external_ips: + raise ProvisioningError( + "Failed to acquire an IP for jump pod automatically." + " Specify proxy_jump.hostname for Kubernetes backend." + ) + jump_pod_hostname = random.choice(cluster_external_ips) + logger.info( + ( + "Jump pod %s is running on node %s which has no external IP," + " picking a random external IP: %s" + ), + jump_pod_name, + jump_pod_node_name, + jump_pod_hostname, + ) + + jump_pod_service = api.read_namespaced_service( + name=jump_pod_service_name, + namespace=namespace, + ) + jump_pod_service_ports = get_or_error(jump_pod_service.spec).ports + if not jump_pod_service_ports: + raise ProvisioningError("Jump pod service %s ports are empty", jump_pod_service_name) + if (jump_pod_port := jump_pod_service_ports[0].node_port) is None: + raise ProvisioningError("Jump pod service %s port is not set", jump_pod_service_name) + + ssh_exit_status, ssh_output = _run_ssh_command( + hostname=jump_pod_hostname, + port=jump_pod_port, + username=JUMP_POD_USER, + ssh_private_key=project_ssh_private_key, + # command= in authorized_keys is equivalent to ForceCommand in sshd_config + # By forcing the /bin/false command we only allow proxy jumping, no shell access + command=f""" + if grep -qvF 
'{user_ssh_public_key}' ~/.ssh/authorized_keys; then + echo 'command="/bin/false" {user_ssh_public_key}' >> ~/.ssh/authorized_keys + fi + """, + ) + if ssh_exit_status != 0: + logger.debug( + "Jump pod %s @ %s:%d, SSH command failed, exit status: %d, output: %s", + jump_pod_name, + jump_pod_hostname, + jump_pod_port, + ssh_exit_status, + ssh_output, + ) + return None + + logger.debug( + "Jump pod %s is available @ %s:%d", + jump_pod_name, + jump_pod_hostname, + jump_pod_port, + ) + return SSHConnectionParams( + hostname=jump_pod_hostname, + port=jump_pod_port, + username=JUMP_POD_USER, + ) + + def _get_jump_pod_commands(authorized_keys: list[str]) -> list[str]: authorized_keys_content = "\n".join(authorized_keys).strip() commands = [ @@ -755,40 +823,11 @@ def _get_jump_pod_commands(authorized_keys: list[str]) -> list[str]: " -o LogLevel=ERROR" " -o PasswordAuthentication=no" " -o AllowTcpForwarding=local" - # proxy jumping only, no shell access - " -o ForceCommand=/bin/false" ), ] return commands -def _wait_for_pod_ready( - api: client.CoreV1Api, - namespace: str, - pod_name: str, - timeout_seconds: int = 300, -): - start_time = time.time() - while True: - pod = call_api_method( - api.read_namespaced_pod, - expected=404, - name=pod_name, - namespace=namespace, - ) - if pod is not None: - pod_status = get_or_error(pod.status) - phase = get_or_error(pod_status.phase) - container_statuses = get_or_error(pod_status.container_statuses) - if phase == "Running" and all(status.ready for status in container_statuses): - return True - elapsed_time = time.time() - start_time - if elapsed_time >= timeout_seconds: - logger.warning("Timeout waiting for pod %s to be ready", pod_name) - return False - time.sleep(1) - - def _wait_for_load_balancer_address( api: client.CoreV1Api, namespace: str, @@ -824,24 +863,6 @@ def _wait_for_load_balancer_address( time.sleep(1) -def _add_authorized_key_to_jump_pod( - jump_pod_host: str, - jump_pod_port: int, - ssh_private_key: str, - 
ssh_authorized_key: str, -): - _run_ssh_command( - hostname=jump_pod_host, - port=jump_pod_port, - ssh_private_key=ssh_private_key, - command=( - f'if grep -qvF "{ssh_authorized_key}" ~/.ssh/authorized_keys; then ' - f"echo {ssh_authorized_key} >> ~/.ssh/authorized_keys; " - "fi" - ), - ) - - def _get_gateway_commands( authorized_keys: List[str], router: Optional[AnyRouterConfig] = None ) -> List[str]: @@ -882,35 +903,34 @@ def _get_gateway_commands( return commands -def _run_ssh_command(hostname: str, port: int, ssh_private_key: str, command: str): +def _run_ssh_command( + hostname: str, port: int, username: str, ssh_private_key: str, command: str +) -> tuple[int, bytes]: with tempfile.NamedTemporaryFile("w+", 0o600) as f: f.write(ssh_private_key) f.flush() - subprocess.run( + proc = subprocess.run( [ "ssh", "-F", "none", "-o", "StrictHostKeyChecking=no", + "-o", + # The same timeout as in core.services.ssh.tunnel.SSH_DEFAULT_OPTIONS, + # which is used, for example, by server.services.runner.ssh.runner_ssh_tunnel() + "ConnectTimeout=3", "-i", f.name, "-p", str(port), - f"root@{hostname}", + f"{username}@{hostname}", command, ], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, ) - - -def _get_jump_pod_name(project_name: str) -> str: - return f"dstack-{project_name}-ssh-jump-pod" - - -def _get_jump_pod_service_name(project_name: str) -> str: - return f"dstack-{project_name}-ssh-jump-pod-service" + return proc.returncode, proc.stdout def _get_pod_service_name(pod_name: str) -> str: diff --git a/src/dstack/_internal/core/backends/kubernetes/resources.py b/src/dstack/_internal/core/backends/kubernetes/resources.py index 018ff5fb62..d5cb1739f4 100644 --- a/src/dstack/_internal/core/backends/kubernetes/resources.py +++ b/src/dstack/_internal/core/backends/kubernetes/resources.py @@ -61,12 +61,11 @@ class PodPhase(str, Enum): FAILED = "Failed" UNKNOWN = "Unknown" # Deprecated: It isn't being set since 2015 - 
@classmethod - def finished_statuses(cls) -> list["PodPhase"]: - return [cls.SUCCEEDED, cls.FAILED] - def is_finished(self): - return self in self.finished_statuses() + return self in [self.SUCCEEDED, self.FAILED] + + def is_running(self): + return self == self.RUNNING class TaintEffect(str, Enum): diff --git a/src/dstack/_internal/core/backends/kubernetes/utils.py b/src/dstack/_internal/core/backends/kubernetes/utils.py index 78213b6178..fb7816b572 100644 --- a/src/dstack/_internal/core/backends/kubernetes/utils.py +++ b/src/dstack/_internal/core/backends/kubernetes/utils.py @@ -9,8 +9,6 @@ ) from typing_extensions import ParamSpec -from dstack._internal.utils.common import get_or_error - T = TypeVar("T") P = ParamSpec("P") @@ -52,30 +50,3 @@ def call_api_method( if e.status not in expected: raise return None - - -def get_cluster_public_ip(api: CoreV1Api) -> Optional[str]: - """ - Returns public IP of any cluster node. - """ - public_ips = get_cluster_public_ips(api) - if len(public_ips) == 0: - return None - return public_ips[0] - - -def get_cluster_public_ips(api: CoreV1Api) -> list[str]: - """ - Returns public IPs of all cluster nodes. 
- """ - public_ips = [] - for node in api.list_node().items: - node_status = get_or_error(node.status) - addresses = get_or_error(node_status.addresses) - - # Look for an external IP address - for address in addresses: - if address.type == "ExternalIP": - public_ips.append(address.address) - - return public_ips From c4ed6ca49ae87bff5fd1a3fadd97d1e28dcaa15c Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 12 Feb 2026 09:43:20 +0000 Subject: [PATCH 134/187] Don't terminate unreachable SSH instances (#3568) Fixes: https://github.com/dstackai/dstack/issues/2531 --- .../background/tasks/process_instances.py | 4 +-- .../tasks/test_process_instances.py | 29 +++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py index 184287b31a..da47cf16ed 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/tasks/process_instances.py @@ -778,7 +778,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non ) return - if instance.termination_deadline is None: + if not is_ssh_instance(instance) and instance.termination_deadline is None: instance.termination_deadline = get_current_datetime() + TERMINATION_DEADLINE_OFFSET if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None: @@ -792,7 +792,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non switch_instance_status(session, instance, InstanceStatus.TERMINATING) elif instance.status.is_available(): deadline = instance.termination_deadline - if get_current_datetime() > deadline: + if deadline is not None and get_current_datetime() > deadline: instance.termination_reason = InstanceTerminationReason.UNREACHABLE switch_instance_status(session, instance, InstanceStatus.TERMINATING) diff --git 
a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/tasks/test_process_instances.py index 8691f3e7e6..8d94ee059b 100644 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ b/src/tests/_internal/server/background/tasks/test_process_instances.py @@ -198,6 +198,7 @@ async def test_check_shim_start_termination_deadline(self, test_db, session: Asy session=session, project=project, status=InstanceStatus.IDLE, + unreachable=False, ) health_status = "SSH connection fail" with patch( @@ -210,11 +211,39 @@ async def test_check_shim_start_termination_deadline(self, test_db, session: Asy assert instance is not None assert instance.status == InstanceStatus.IDLE + assert instance.unreachable assert instance.termination_deadline is not None assert instance.termination_deadline.replace( tzinfo=dt.timezone.utc ) > get_current_datetime() + dt.timedelta(minutes=19) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_check_shim_does_not_start_termination_deadline_with_ssh_instance( + self, test_db, session: AsyncSession + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + remote_connection_info=get_remote_connection_info(), + ) + health_status = "SSH connection fail" + with patch( + "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + ) as healthcheck: + healthcheck.return_value = InstanceCheck(reachable=False, message=health_status) + await process_instances() + + await session.refresh(instance) + + assert instance is not None + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable + assert instance.termination_deadline is None + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def 
test_check_shim_stop_termination_deadline(self, test_db, session: AsyncSession): From 107fefef52821c544abe1755d9379fe9ed54a96c Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:52:56 +0100 Subject: [PATCH 135/187] [Docs] Nebius example under `Clusters` (#3567) * 1. Added `Nebius` example under `Clusters` 2. Minor updates (for consistency) to `Lambda` and `Crusoe` * Update examples/clusters/crusoe/README.md Co-authored-by: Dmitry Meyer * PR review --------- Co-authored-by: Dmitry Meyer --- docs/examples.md | 10 + docs/examples/clusters/nebius/index.md | 0 examples/clusters/crusoe/README.md | 48 ++--- examples/clusters/lambda/README.md | 34 ++-- examples/clusters/nebius/README.md | 257 +++++++++++++++++++++++++ mkdocs.yml | 1 + 6 files changed, 310 insertions(+), 40 deletions(-) create mode 100644 docs/examples/clusters/nebius/index.md create mode 100644 examples/clusters/nebius/README.md diff --git a/docs/examples.md b/docs/examples.md index 575c747ca2..e57a41cf52 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -122,6 +122,16 @@ hide: Set up Crusoe clusters with optimized networking

+ +

+ Nebius +

+ +

+ Set up Nebius clusters with optimized networking +

+

diff --git a/docs/examples/clusters/nebius/index.md b/docs/examples/clusters/nebius/index.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/clusters/crusoe/README.md b/examples/clusters/crusoe/README.md index b34c4aef34..50ec88e461 100644 --- a/examples/clusters/crusoe/README.md +++ b/examples/clusters/crusoe/README.md @@ -1,24 +1,25 @@ --- title: Crusoe -description: Setting up Crusoe clusters using Managed Kubernetes or VMs with InfiniBand support +description: Using Crusoe clusters with InfiniBand support via Kubernetes or VMs --- # Crusoe -Crusoe offers two ways to use clusters with fast interconnect: +`dstack` allows using Crusoe clusters with fast interconnect via two ways: -* [Crusoe Managed Kubernetes](#kubernetes) – Lets you interact with clusters through the Kubernetes API and includes support for NVIDIA and AMD GPU operators and related tools. -* [Virtual Machines (VMs)](#vms) – Gives you direct access to clusters in the form of virtual machines with NVIDIA and AMD GPUs. +* [Kubernetes](#kubernetes) – If you create a Kubernetes cluster on Crusoe and configure a `kubernetes` backend and create a backend fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. +* [VMs](#vms) – If you create a VM cluster on Crusoe and create an SSH fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. + +## Kubernetes -Both options use the same underlying networking infrastructure. This example walks you through how to set up Crusoe clusters to use with `dstack`. +### Create a cluster -## Crusoe Managed Kubernetes { #kubernetes } +1. Go `Networking` → `Firewall Rules`, click `Create Firewall Rule`, and allow ingress traffic on port `30022`. This port will be used by the `dstack` server to access the jump host. +2. Go to `Orchestration` and click `Create Cluster`. Make sure to enable the `NVIDIA GPU Operator` add-on. +3. Go the the cluster, and click `Create Node Pool`. 
Select the right type of the instance, and `Desired Number of Nodes`. +4. Wait until nodes are provisioned. -!!! info "Prerequsisites" - 1. Go `Networking` → `Firewall Rules`, click `Create Firewall Rule`, and allow ingress traffic on port `30022`. This port will be used by the `dstack` server to access the jump host. - 2. Go to `Orchestration` and click `Create Cluster`. Make sure to enable the `NVIDIA GPU Operator` add-on. - 3. Go the the cluster, and click `Create Node Pool`. Select the right type of the instance. If you intend to auto-scale the cluster, make sure to set `Desired Number of Nodes` at least to `1`, since `dstack` doesn't currently support clusters that scale down to `0` nodes. - 4. Wait until at least one node is running. +> Even if you enable `autoscaling`, `dstack` can use only the nodes that are already provisioned. ### Configure the backend @@ -56,7 +57,7 @@ backends: [kubernetes] resources: # Specify requirements to filter nodes - gpu: 1..8 + gpu: 8 ``` @@ -75,12 +76,13 @@ Once the fleet is created, you can run [dev environments](https://dstack.ai/docs ## VMs -Another way to work with Crusoe clusters is through VMs. While `dstack` typically supports VM-based compute providers via [dedicated backends](https://dstack.ai/docs/concepts/backends#vm-based) that automate provisioning, Crusoe does not yet have [such a backend](https://github.com/dstackai/dstack/issues/3378). As a result, to use a VM-based Crusoe cluster with `dstack`, you should use [SSH fleets](https://dstack.ai/docs/concepts/fleets). +Another way to work with Crusoe clusters is through VMs. While `dstack` typically supports VM-based compute providers via [dedicated backends](https://dstack.ai/docs/concepts/backends#vm-based) that automate provisioning, Crusoe does not yet have [such a backend](https://github.com/dstackai/dstack/issues/3378). As a result, to use a VM-based Crusoe cluster with `dstack`, you should use [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh-fleets). 
-!!! info "Prerequsisites" - 1. Go to `Compute`, then `Instances`, and click `Create Instance`. Make sure to select the right instance type and VM image (that [support interconnect](https://docs.crusoecloud.com/networking/infiniband/managing-infiniband-networks/index.html)). Make sure to create as many instances as needed. +### Create instances -### Create a fleet +1. Go to `Compute`, then `Instances`, and click `Create Instance`. Make sure to select the right instance type and VM image (that [support interconnect](https://docs.crusoecloud.com/networking/infiniband/managing-infiniband-networks/index.html)). Make sure to create as many instances as needed. + +### Create a `dstack` fleet Follow the standard instructions for setting up an [SSH fleet](https://dstack.ai/docs/concepts/fleets/#ssh-fleets): @@ -115,9 +117,9 @@ $ dstack apply -f crusoe-fleet.dstack.yml Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). -## Run NCCL tests +## NCCL tests -Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-task) that runs NCCL tests to validate cluster network bandwidth. +Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) that runs NCCL tests to validate cluster network bandwidth. === "Crusoe Managed Kubernetes" @@ -253,9 +255,9 @@ Provisioning... 
nccl-tests provisioning completed (running) -# out-of-place in-place -# size count type redop root time algbw busbw #wrong time algbw busbw #wrong -# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) +out-of-place in-place + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) 8 2 float sum -1 27.70 0.00 0.00 0 29.82 0.00 0.00 0 16 4 float sum -1 28.78 0.00 0.00 0 28.99 0.00 0.00 0 32 8 float sum -1 28.49 0.00 0.00 0 28.16 0.00 0.00 0 @@ -285,8 +287,8 @@ nccl-tests provisioning completed (running) 536870912 134217728 float sum -1 5300.49 101.29 189.91 0 5314.91 101.01 189.40 0 1073741824 268435456 float sum -1 10472.2 102.53 192.25 0 10485.6 102.40 192.00 0 2147483648 536870912 float sum -1 20749.1 103.50 194.06 0 20745.7 103.51 194.09 0 -# Out of bounds values : 0 OK -# Avg bus bandwidth : 53.7387 + Out of bounds values : 0 OK + Avg bus bandwidth : 53.7387 ``` diff --git a/examples/clusters/lambda/README.md b/examples/clusters/lambda/README.md index 50a98bf6ed..07fb0ce926 100644 --- a/examples/clusters/lambda/README.md +++ b/examples/clusters/lambda/README.md @@ -5,18 +5,17 @@ description: Setting up Lambda clusters using Kubernetes or 1-Click Clusters wit # Lambda -[Lambda](https://lambda.ai/) offers two ways to use clusters with a fast interconnect: +`dstack` allows using Lambda clusters with fast interconnect via two ways: -* [Kubernetes](#kubernetes) – Lets you interact with clusters through the Kubernetes API and includes support for NVIDIA GPU operators and related tools. -* [1-Click Clusters (1CC)](#1-click-clusters) – Gives you direct access to clusters in the form of bare-metal nodes. - -Both options use the same underlying networking infrastructure. This example walks you through how to set up Lambda clusters to use with `dstack`. 
+* [Kubernetes](#kubernetes) – If you create a Kubernetes cluster on Lambda and configure a `kubernetes` backend and create a backend fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. +* [VMs](#vms) – If you create a 1CC cluster on Lambda and create an SSH fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. ## Kubernetes -!!! info "Prerequsisites" - 1. Follow the instructions in [Lambda's guide](https://docs.lambda.ai/public-cloud/1-click-clusters/managed-kubernetes/#accessing-mk8s) on accessing MK8s. - 2. Go to `Firewall` → `Edit rules`, click `Add rule`, and allow ingress traffic on port `30022`. This port will be used by the `dstack` server to access the jump host. +### Prerequsisites + +1. Follow the instructions in [Lambda's guide](https://docs.lambda.ai/public-cloud/1-click-clusters/managed-kubernetes/#accessing-mk8s) on accessing MK8s. +2. Go to `Firewall` → `Edit rules`, click `Add rule`, and allow ingress traffic on port `30022`. This port will be used by the `dstack` server to access the jump host. ### Configure the backend @@ -75,8 +74,9 @@ Once the fleet is created, you can run [dev environments](https://dstack.ai/docs Another way to work with Lambda clusters is through [1CC](https://lambda.ai/1-click-clusters). While `dstack` supports automated cluster provisioning via [VM-based backends](https://dstack.ai/docs/concepts/backends#vm-based), there is currently no programmatic way to provision Lambda 1CCs. As a result, to use a 1CC cluster with `dstack`, you must use [SSH fleets](https://dstack.ai/docs/concepts/fleets). -!!! info "Prerequsisites" - 1. Follow the instructions in [Lambda's guide](https://docs.lambda.ai/public-cloud/1-click-clusters/) on working with 1-Click Clusters +### Prerequsisites + +1. 
Follow the instructions in [Lambda's guide](https://docs.lambda.ai/public-cloud/1-click-clusters/) on working with 1-Click Clusters ### Create a fleet @@ -171,11 +171,11 @@ $ dstack apply -f lambda-nccl-tests.dstack.yml Provisioning... ---> 100% -# nccl-tests version 2.17.6 nccl-headers=22602 nccl-library=22602 -# Collective test starting: all_reduce_perf -# -# size count type redop root time algbw busbw #wrong time algbw busbw #wrong -# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + nccl-tests version 2.17.6 nccl-headers=22602 nccl-library=22602 + Collective test starting: all_reduce_perf + + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) 8 2 float sum -1 36.50 0.00 0.00 0 36.16 0.00 0.00 0 16 4 float sum -1 35.55 0.00 0.00 0 35.49 0.00 0.00 0 32 8 float sum -1 35.49 0.00 0.00 0 36.28 0.00 0.00 0 @@ -205,8 +205,8 @@ Provisioning... 536870912 134217728 float sum -1 1625.63 330.25 619.23 0 1687.31 318.18 596.59 0 1073741824 268435456 float sum -1 2972.25 361.26 677.35 0 2971.33 361.37 677.56 0 2147483648 536870912 float sum -1 5784.75 371.23 696.06 0 5728.40 374.88 702.91 0 -# Out of bounds values : 0 OK -# Avg bus bandwidth : 137.179 + Out of bounds values : 0 OK + Avg bus bandwidth : 137.179 ``` diff --git a/examples/clusters/nebius/README.md b/examples/clusters/nebius/README.md new file mode 100644 index 0000000000..70b41f8a87 --- /dev/null +++ b/examples/clusters/nebius/README.md @@ -0,0 +1,257 @@ +--- +title: Nebius +description: Using Nebius clusters with InfiniBand support via VMs or Kubernetes +--- + +# Nebius + +`dstack` allows you to use Nebius clusters with fast interconnects in two ways: + +* [VMs](#vms) – If you configure a `nebius` backend in `dstack` by providing your Nebius credentials, `dstack` lets you fully provision and use clusters through `dstack`. 
+* [Kubernetes](#kubernetes) – If you create a Kubernetes cluster on Nebius and configure a `kubernetes` backend and create a backend fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. + +## VMs + +Since `dstack` offers a VM-based backend that natively integrates with Nebius, you only need to provide your Nebius credentials to `dstack`, and it will allow you to fully provision and use clusters on Nebius through `dstack`. + +### Configure a backend + +You can configure the `nebius` backend using a credentials file [generated](https://docs.nebius.com/iam/service-accounts/authorized-keys#create) by the `nebius` CLI: + +
+ +```shell +$ nebius iam auth-public-key generate \ + --service-account-id <service account ID> \ + --output ~/.nebius/sa-credentials.json +``` + +
+ +
+ +```yaml +projects: +- name: main + backends: + - type: nebius + creds: + type: service_account + filename: ~/.nebius/sa-credentials.json +``` + +
+ +### Create a fleet + +Once the backend configured, you can create a fleet: + +
+ +```yaml +type: fleet +name: nebius-fleet + +nodes: 2 +placement: cluster + +backends: [nebius] + +resources: + gpu: H100:8 +``` + +
+ +Pass the fleet configuration to `dstack apply`: + +
+ +```shell +$ dstack apply -f nebius-fleet.dstack.yml +``` + +
+ +This will automatically create a Nebius cluster and provision instances. + +Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). + +> If you want instances to be provisioned on demand, you can set `nodes` to `0..2`. In this case, `dstack` will create instances only when you run workloads. + +## Kubernetes + +If, for some reason, you’d like to use dstack with Nebius’s managed Kubernetes service, you can point `dstack` to the cluster’s kubeconfig file, and `dstack` will allow you to fully use this cluster through `dstack`. + +### Create a cluster + +1. Go to `Compute` → `Kubernetes` and click `Create cluster`. Make sure to enable `Public endpoint`. +2. Go to `Node groups` and click `Create node group`. Make sure to enable `Assign public IPv4 addresses` and `Install NVIDIA GPU drivers and other components`. Select the appropriate instance type, specify the `Number of nodes`, and set `Node storage` to at least `120 GiB`. Make sure to click `Create` under `GPU cluster` if you plan to use a fast interconnect. +3. Go to `Applications`, find `NVIDIA Device Plugin`, and click `Deploy`. +4. Wait until the nodes are provisioned. + +> Even if you enable `autoscaling`, `dstack` can use only the nodes that are already provisioned. To provision instances on demand, use [VMs](#vms) (see above). + +#### Configure the kubeconfig file + +1. Click `How to connect` and copy the `nebius` CLI command that configures the `kubeconfig` file. +2. Install the `nebius` CLI and run the command: + +
+ +```shell +$ nebius mk8s cluster get-credentials --id <cluster id> --external +``` + +
+ +### Configure a backend + +Follow the standard instructions for setting up a [`kubernetes`](https://dstack.ai/docs/concepts/backends/#kubernetes) backend: + +
+ +```yaml +projects: + - name: main + backends: + - type: kubernetes + kubeconfig: + filename: +``` + +
+ +### Create a fleet + +Once the cluster and the `dstack` server are running, you can create a fleet: + +
+ +```yaml +type: fleet +name: nebius-fleet + +placement: cluster +nodes: 0.. + +backends: [kubernetes] + +resources: + # Specify requirements to filter nodes + gpu: 8 +``` + +
+ +Pass the fleet configuration to `dstack apply`: + +
+ +```shell +$ dstack apply -f nebius-fleet.dstack.yml +``` + +
+ +Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). + +## NCCL tests + +Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) to run NCCL tests and validate the cluster’s network bandwidth. + +
+ +```yaml +type: task +name: nccl-tests + +nodes: 2 +startup_order: workers-first +stop_criteria: master-done + +env: + - NCCL_DEBUG=INFO +commands: + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 + else + sleep infinity + fi + +# Required for `/dev/infiniband` access +privileged: true + +resources: + gpu: 8 + shm_size: 16GB +``` + +
+ +Pass the configuration to `dstack apply`: + +
+ +```shell +$ dstack apply -f crusoe-nccl-tests.dstack.yml + +Provisioning... +---> 100% + +nccl-tests provisioning completed (running) + + out-of-place in-place + size count type redop root time algbw busbw #wrong time algbw busbw #wrong + (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 45.72 0.00 0.00 0 29.78 0.00 0.00 0 + 16 4 float sum -1 29.92 0.00 0.00 0 29.42 0.00 0.00 0 + 32 8 float sum -1 30.10 0.00 0.00 0 29.75 0.00 0.00 0 + 64 16 float sum -1 34.48 0.00 0.00 0 29.36 0.00 0.00 0 + 128 32 float sum -1 30.38 0.00 0.01 0 29.67 0.00 0.01 0 + 256 64 float sum -1 30.48 0.01 0.02 0 29.97 0.01 0.02 0 + 512 128 float sum -1 30.45 0.02 0.03 0 30.85 0.02 0.03 0 + 1024 256 float sum -1 31.36 0.03 0.06 0 31.29 0.03 0.06 0 + 2048 512 float sum -1 32.27 0.06 0.12 0 32.26 0.06 0.12 0 + 4096 1024 float sum -1 36.04 0.11 0.21 0 43.17 0.09 0.18 0 + 8192 2048 float sum -1 37.24 0.22 0.41 0 35.54 0.23 0.43 0 + 16384 4096 float sum -1 37.22 0.44 0.83 0 34.55 0.47 0.89 0 + 32768 8192 float sum -1 43.82 0.75 1.40 0 35.64 0.92 1.72 0 + 65536 16384 float sum -1 37.85 1.73 3.25 0 37.55 1.75 3.27 0 + 131072 32768 float sum -1 43.10 3.04 5.70 0 53.08 2.47 4.63 0 + 262144 65536 float sum -1 58.59 4.47 8.39 0 63.33 4.14 7.76 0 + 524288 131072 float sum -1 97.88 5.36 10.04 0 83.91 6.25 11.72 0 + 1048576 262144 float sum -1 87.08 12.04 22.58 0 77.82 13.47 25.26 0 + 2097152 524288 float sum -1 99.06 21.17 39.69 0 97.67 21.47 40.26 0 + 4194304 1048576 float sum -1 110.14 38.08 71.40 0 114.66 36.58 68.59 0 + 8388608 2097152 float sum -1 154.48 54.30 101.82 0 156.03 53.76 100.80 0 + 16777216 4194304 float sum -1 210.33 79.77 149.56 0 200.98 83.48 156.52 0 + 33554432 8388608 float sum -1 274.23 122.36 229.43 0 276.45 121.38 227.58 0 + 67108864 16777216 float sum -1 472.43 142.05 266.35 0 480.00 139.81 262.14 0 + 134217728 33554432 float sum -1 759.58 176.70 331.31 0 756.21 177.49 332.79 0 + 268435456 67108864 float sum -1 1305.66 205.59 385.49 0 1303.37 205.95 386.16 
0 + 536870912 134217728 float sum -1 2379.38 225.63 423.06 0 2373.42 226.20 424.13 0 + 1073741824 268435456 float sum -1 4511.97 237.98 446.21 0 4513.82 237.88 446.02 0 + 2147483648 536870912 float sum -1 8776.26 244.69 458.80 0 8760.42 245.13 459.63 0 + 4294967296 1073741824 float sum -1 17407.8 246.73 462.61 0 17302.2 248.23 465.44 0 + 8589934592 2147483648 float sum -1 34448.4 249.36 467.54 0 34381.0 249.85 468.46 0 + Out of bounds values : 0 OK + Avg bus bandwidth : 125.499 + + Collective test concluded: all_reduce_perf +``` + +
+ +## What's next + +1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services) +2. Check out [backends](https://dstack.ai/docs/concepts/backends) and [fleets](https://dstack.ai/docs/concepts/fleets) +3. Read Nebius' docs on [networking for VMs](https://docs.nebius.com/compute/clusters/gpu) and the [managed Kubernetes service](https://docs.nebius.com/kubernetes). diff --git a/mkdocs.yml b/mkdocs.yml index c98af0a2dd..ef745e6548 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -296,6 +296,7 @@ nav: - GCP: examples/clusters/gcp/index.md - Lambda: examples/clusters/lambda/index.md - Crusoe: examples/clusters/crusoe/index.md + - Nebius: examples/clusters/nebius/index.md - NCCL/RCCL tests: examples/clusters/nccl-rccl-tests/index.md - Inference: - SGLang: examples/inference/sglang/index.md From 4a63197a7e8f0b22b0c14c3502ccf277ed79ef07 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 12 Feb 2026 14:31:37 +0000 Subject: [PATCH 136/187] [Docs] Add get nodes rule to K8s ClusterRole (#3571) --- docs/docs/concepts/backends.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index da58f36e03..9dc2ebb573 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -1055,7 +1055,7 @@ projects: verbs: ["get", "create", "delete"] - apiGroups: [""] resources: ["nodes"] - verbs: ["list"] + verbs: ["list", "get"] ``` Ensure you've created a ClusterRoleBinding to grant the role to the user or the service account you're using. 
From 775aff09561c0c818eb7f7d5ff8344250864b8a6 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Fri, 13 Feb 2026 08:31:46 +0100 Subject: [PATCH 137/187] [Docs] Typo in the `Nebius` example under `Clusters` --- examples/clusters/nebius/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/clusters/nebius/README.md b/examples/clusters/nebius/README.md index 70b41f8a87..9f8bd349a0 100644 --- a/examples/clusters/nebius/README.md +++ b/examples/clusters/nebius/README.md @@ -201,7 +201,7 @@ Pass the configuration to `dstack apply`:
```shell -$ dstack apply -f crusoe-nccl-tests.dstack.yml +$ dstack apply -f nebius-nccl-tests.dstack.yml Provisioning... ---> 100% From 5132bb90d8265db15dfa3c20c303d1505f6dfbf1 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Mon, 16 Feb 2026 09:33:33 +0100 Subject: [PATCH 138/187] [Docs] Clarified the behavior of idle duration: how run's `idle_duration` and fleet's `idle_duration` are applied (#3574) --- docs/docs/concepts/snippets/manage-fleets.ext | 12 ++++++++---- docs/docs/guides/protips.md | 11 +++++++---- src/dstack/_internal/core/models/profiles.py | 6 ++++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/docs/concepts/snippets/manage-fleets.ext b/docs/docs/concepts/snippets/manage-fleets.ext index a9d5c2d3e4..c9835fc679 100644 --- a/docs/docs/concepts/snippets/manage-fleets.ext +++ b/docs/docs/concepts/snippets/manage-fleets.ext @@ -1,6 +1,10 @@ ### Idle duration -If a run provisions a new instance, the instance stays `idle` for 5 minutes by default and can be reused within that time. -If the instance is not reused within this period, it is automatically terminated. -To change the default idle duration, set -[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) in the run configuration (e.g., `0s`, `1m`, or `off` for unlimited). +If the run is submitted to a fleet with `nodes` set to a range and a new instance is provisioned, +the shorter of the fleet's and run's `idle_duration` is used. +If the run reuses an existing fleet instance, only the fleet's +[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) applies. + +If an instance remains `idle`, it is automatically terminated after `idle_duration`. + +> Not applied for container-based backends (Kubernetes, Vast.ai, RunPod). 
diff --git a/docs/docs/guides/protips.md b/docs/docs/guides/protips.md index cc31e9f277..4aa5df93fb 100644 --- a/docs/docs/guides/protips.md +++ b/docs/docs/guides/protips.md @@ -212,10 +212,13 @@ Or, set [`creation_policy`](../reference/dstack.yml/dev-environment.md#creation_ ### Idle duration -If a run provisions a new instance, the instance stays `idle` for 5 minutes by default and can be reused within that time. -If the instance is not reused within this period, it is automatically terminated. -To change the default idle duration, set -[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) in the run configuration (e.g., `0s`, `1m`, or `off` for unlimited). +If the run is submitted to a fleet with `nodes` set to a range and a new instance is provisioned, the shorter of the fleet's and run's `idle_duration` is used. +If the run reuses an existing fleet instance, only the fleet's +[`idle_duration`](../reference/dstack.yml/fleet.md#idle_duration) applies. + +If an instance remains `idle`, it is automatically terminated after `idle_duration`. + +> Not applied for container-based backends (Kubernetes, Vast.ai, RunPod). ## Volumes diff --git a/src/dstack/_internal/core/models/profiles.py b/src/dstack/_internal/core/models/profiles.py index f69f094028..97ce591aad 100644 --- a/src/dstack/_internal/core/models/profiles.py +++ b/src/dstack/_internal/core/models/profiles.py @@ -323,9 +323,11 @@ class ProfileParams(CoreModel): Field( description=( "Time to wait before terminating idle instances." - " Instances are not terminated if the fleet is already at `nodes.min`." + " When the run reuses an existing fleet instance, the fleet's `idle_duration` applies." + " When the run provisions a new instance, the shorter of the fleet's and run's values is used." " Defaults to `5m` for runs and `3d` for fleets." - " Use `off` for unlimited duration" + " Use `off` for unlimited duration." 
+ " Only applied for VM-based backends" ) ), ] = None From ad9fd69580703ec68b3c35402a807c8c7fac7a57 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Mon, 16 Feb 2026 10:34:49 +0000 Subject: [PATCH 139/187] [runner] Don't bind to public addresses (#3575) Closes: https://github.com/dstackai/dstack/issues/3078 --- runner/cmd/runner/main.go | 19 ++++++++++++++++--- runner/internal/shim/docker.go | 23 +++++++++++++++++------ runner/internal/shim/docker_test.go | 2 +- runner/internal/shim/models.go | 2 +- 4 files changed, 35 insertions(+), 11 deletions(-) diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index c8125dc848..8a62fd6f50 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -31,6 +31,7 @@ func main() { func mainInner() int { var tempDir string + var httpAddress string var httpPort int var sshPort int var sshAuthorizedKeys []string @@ -61,6 +62,13 @@ func mainInner() int { Destination: &tempDir, TakesFile: true, }, + &cli.StringFlag{ + Name: "http-address", + Usage: "Set a http bind address", + Value: "", + DefaultText: "all interfaces", + Destination: &httpAddress, + }, &cli.IntFlag{ Name: "http-port", Usage: "Set a http port", @@ -86,7 +94,7 @@ func mainInner() int { }, }, Action: func(ctx context.Context, cmd *cli.Command) error { - return start(ctx, tempDir, httpPort, sshPort, sshAuthorizedKeys, logLevel, Version) + return start(ctx, logLevel, tempDir, httpAddress, httpPort, sshPort, sshAuthorizedKeys) }, }, }, @@ -103,7 +111,12 @@ func mainInner() int { return 0 } -func start(ctx context.Context, tempDir string, httpPort int, sshPort int, sshAuthorizedKeys []string, logLevel int, version string) error { +func start( + ctx context.Context, + logLevel int, tempDir string, + httpAddress string, httpPort int, + sshPort int, sshAuthorizedKeys []string, +) error { if err := os.MkdirAll(tempDir, 0o755); err != nil { return fmt.Errorf("create temp directory: %w", err) } @@ -191,7 +204,7 @@ func start(ctx context.Context, 
tempDir string, httpPort int, sshPort int, sshAu return fmt.Errorf("create executor: %w", err) } - server, err := api.NewServer(ctx, fmt.Sprintf(":%d", httpPort), version, ex) + server, err := api.NewServer(ctx, fmt.Sprintf("%s:%d", httpAddress, httpPort), Version, ex) if err != nil { return fmt.Errorf("create server: %w", err) } diff --git a/runner/internal/shim/docker.go b/runner/internal/shim/docker.go index 1fd8d959af..88a7f37c02 100644 --- a/runner/internal/shim/docker.go +++ b/runner/internal/shim/docker.go @@ -806,8 +806,6 @@ func (d *DockerRunner) createContainer(ctx context.Context, task *Task) error { } mounts = append(mounts, instanceMounts...) - ports := d.dockerParams.DockerPorts() - // Set the environment variables envVars := []string{} if d.dockerParams.DockerPJRTDevice() != "" { @@ -827,9 +825,19 @@ func (d *DockerRunner) createContainer(ctx context.Context, task *Task) error { } } + networkMode := getNetworkMode(task.config.NetworkMode) + ports := d.dockerParams.DockerPorts() + + // Bridge mode - all interfaces + runnerHttpAddress := "" + if networkMode.IsHost() { + runnerHttpAddress = "localhost" + } + shellCommands := d.dockerParams.DockerShellCommands(task.config.ContainerSshKeys, runnerHttpAddress) + containerConfig := &container.Config{ Image: task.config.ImageName, - Cmd: []string{strings.Join(d.dockerParams.DockerShellCommands(task.config.ContainerSshKeys), " && ")}, + Cmd: []string{strings.Join(shellCommands, " && ")}, Entrypoint: []string{"/bin/sh", "-c"}, ExposedPorts: exposePorts(ports), Env: envVars, @@ -843,7 +851,7 @@ func (d *DockerRunner) createContainer(ctx context.Context, task *Task) error { } hostConfig := &container.HostConfig{ Privileged: task.config.Privileged || d.dockerParams.DockerPrivileged(), - NetworkMode: getNetworkMode(task.config.NetworkMode), + NetworkMode: networkMode, PortBindings: bindPorts(ports), Mounts: mounts, ShmSize: task.config.ShmSize, @@ -1182,7 +1190,7 @@ func (c *CLIArgs) DockerPJRTDevice() string { 
return c.Docker.PJRTDevice } -func (c *CLIArgs) DockerShellCommands(publicKeys []string) []string { +func (c *CLIArgs) DockerShellCommands(authorizedKeys []string, runnerHttpAddress string) []string { commands := getSSHShellCommands() runnerCommand := []string{ consts.RunnerBinaryPath, @@ -1192,7 +1200,10 @@ func (c *CLIArgs) DockerShellCommands(publicKeys []string) []string { "--http-port", strconv.Itoa(c.Runner.HTTPPort), "--ssh-port", strconv.Itoa(c.Runner.SSHPort), } - for _, key := range publicKeys { + if runnerHttpAddress != "" { + runnerCommand = append(runnerCommand, "--http-address", runnerHttpAddress) + } + for _, key := range authorizedKeys { runnerCommand = append(runnerCommand, "--ssh-authorized-key", fmt.Sprintf("'%s'", key)) } return append(commands, strings.Join(runnerCommand, " ")) diff --git a/runner/internal/shim/docker_test.go b/runner/internal/shim/docker_test.go index faa31bbc06..18f8c31fca 100644 --- a/runner/internal/shim/docker_test.go +++ b/runner/internal/shim/docker_test.go @@ -110,7 +110,7 @@ func (c *dockerParametersMock) DockerPJRTDevice() string { return "" } -func (c *dockerParametersMock) DockerShellCommands(publicKeys []string) []string { +func (c *dockerParametersMock) DockerShellCommands(authorizedKeys []string, runnerHttpAddress string) []string { commands := make([]string, 0) if c.sshShellCommands { commands = append(commands, getSSHShellCommands()...) 
diff --git a/runner/internal/shim/models.go b/runner/internal/shim/models.go index 5952286507..d50fe6e297 100644 --- a/runner/internal/shim/models.go +++ b/runner/internal/shim/models.go @@ -6,7 +6,7 @@ import ( type DockerParameters interface { DockerPrivileged() bool - DockerShellCommands([]string) []string + DockerShellCommands(authorizedKeys []string, runnerHttpAddress string) []string DockerMounts(string) ([]mount.Mount, error) DockerPorts() []int MakeRunnerDir(name string) (string, error) From 5cc60b710f0cbed729de6b9857dfa9eb3415969e Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 17 Feb 2026 10:01:48 +0100 Subject: [PATCH 140/187] Migrate service model base url (#3560) * - Assign `service.model.base_url` to `service.url` + `prefix` (e.g. `/v1`) if `model` has `openai` format. - Add CORS support to the gateway service endpoint (if `model` has `openai` format) * - Ensure CORS logic works even if project has multiple services with the same model name * Add gateway services state migration for CORS support in services for backward compatibility (with older versions of gateway) * Update SKILL.md to reflect service.model.base_url changes Now that service.model.base_url points to service.url + /v1 for openai-format models, it is no longer deprecated and can be recommended as the model endpoint. Co-authored-by: Cursor * Shorten SKILL.md model endpoint wording Co-authored-by: Cursor * PR feedback: Refactor `get_nginx_service_config` to use `service.cors_enabled`. 
--- skills/dstack/SKILL.md | 2 +- .../_internal/core/models/configurations.py | 3 +- .../gateway/resources/nginx/service.jinja2 | 29 +++++++++++++++ .../_internal/proxy/gateway/services/nginx.py | 1 + .../proxy/gateway/services/registry.py | 28 +++++++++++++++ src/dstack/_internal/proxy/lib/models.py | 1 + .../services/jobs/configurators/base.py | 11 +++--- .../server/services/services/__init__.py | 19 +++++++--- .../_internal/server/routers/test_runs.py | 36 +++++++++++++++---- .../jobs/configurators/test_service.py | 7 ++-- 10 files changed, 114 insertions(+), 23 deletions(-) diff --git a/skills/dstack/SKILL.md b/skills/dstack/SKILL.md index 8bad2b5c79..1d362c8520 100644 --- a/skills/dstack/SKILL.md +++ b/skills/dstack/SKILL.md @@ -222,7 +222,7 @@ resources: - Without gateway: `/proxy/services/f//` - With gateway: `https://./` - Authentication: Unless `auth` is `false`, include `Authorization: Bearer ` on all service requests. -- OpenAI-compatible models: Use `service.url` from `dstack run get --json` and append `/v1` as the base URL; do **not** use deprecated `service.model.base_url` for requests. +- Model endpoint: If `model` is set, `service.model.base_url` from `dstack run get --json` provides the model endpoint. For OpenAI-compatible models (the default, unless format is set otherwise), this will be `service.url` + `/v1`. 
- Example (with gateway): ```bash curl -sS -X POST "https://./v1/chat/completions" \ diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index db965a7697..040c382359 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -57,8 +57,7 @@ DEFAULT_PROBE_UNTIL_READY = False MAX_PROBE_URL_LEN = 2048 DEFAULT_REPLICA_GROUP_NAME = "0" -DEFAULT_MODEL_PROBE_TIMEOUT = 30 -DEFAULT_MODEL_PROBE_URL = "/v1/chat/completions" +OPENAI_MODEL_PROBE_TIMEOUT = 30 class RunConfigurationType(str, Enum): diff --git a/src/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 b/src/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 index 31f987706a..521e6a23fb 100644 --- a/src/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +++ b/src/dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 @@ -24,6 +24,17 @@ server { {% for location in locations %} location {{ location.prefix }} { + {% if cors_enabled %} + # Handle CORS preflight before auth (rewrite phase runs before access phase) + if ($request_method = 'OPTIONS') { + add_header 'Access-Control-Allow-Origin' '*' always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, PATCH, OPTIONS, HEAD' always; + add_header 'Access-Control-Allow-Headers' '*' always; + add_header 'Access-Control-Max-Age' '600' always; + return 204; + } + {% endif %} + {% if auth %} auth_request /_dstack_auth; {% endif %} @@ -46,6 +57,15 @@ server { location @websocket { set $dstack_replica_hit 1; {% if replicas %} + {% if cors_enabled %} + proxy_hide_header 'Access-Control-Allow-Origin'; + proxy_hide_header 'Access-Control-Allow-Methods'; + proxy_hide_header 'Access-Control-Allow-Headers'; + proxy_hide_header 'Access-Control-Allow-Credentials'; + add_header 'Access-Control-Allow-Origin' '*' always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, PATCH, OPTIONS, 
HEAD' always; + add_header 'Access-Control-Allow-Headers' '*' always; + {% endif %} proxy_pass http://{{ domain }}.upstream; proxy_set_header X-Real-IP $remote_addr; proxy_set_header Host $host; @@ -60,6 +80,15 @@ server { location @ { set $dstack_replica_hit 1; {% if replicas %} + {% if cors_enabled %} + proxy_hide_header 'Access-Control-Allow-Origin'; + proxy_hide_header 'Access-Control-Allow-Methods'; + proxy_hide_header 'Access-Control-Allow-Headers'; + proxy_hide_header 'Access-Control-Allow-Credentials'; + add_header 'Access-Control-Allow-Origin' '*' always; + add_header 'Access-Control-Allow-Methods' 'GET, POST, PUT, DELETE, PATCH, OPTIONS, HEAD' always; + add_header 'Access-Control-Allow-Headers' '*' always; + {% endif %} proxy_pass http://{{ domain }}.upstream; proxy_set_header X-Real-IP $remote_addr; proxy_set_header Host $host; diff --git a/src/dstack/_internal/proxy/gateway/services/nginx.py b/src/dstack/_internal/proxy/gateway/services/nginx.py index bbda92d91b..c971d4197a 100644 --- a/src/dstack/_internal/proxy/gateway/services/nginx.py +++ b/src/dstack/_internal/proxy/gateway/services/nginx.py @@ -72,6 +72,7 @@ class ServiceConfig(SiteConfig): replicas: list[ReplicaConfig] router: Optional[AnyRouterConfig] = None router_port: Optional[int] = None + cors_enabled: bool = False class ModelEntrypointConfig(SiteConfig): diff --git a/src/dstack/_internal/proxy/gateway/services/registry.py b/src/dstack/_internal/proxy/gateway/services/registry.py index 636d8c38ec..036b864396 100644 --- a/src/dstack/_internal/proxy/gateway/services/registry.py +++ b/src/dstack/_internal/proxy/gateway/services/registry.py @@ -47,6 +47,7 @@ async def register_service( service_conn_pool: ServiceConnectionPool, router: Optional[AnyRouterConfig] = None, ) -> None: + cors_enabled = model is not None and model.type == "chat" and model.format == "openai" service = models.Service( project_name=project_name, run_name=run_name, @@ -57,6 +58,7 @@ async def register_service( 
client_max_body_size=client_max_body_size, replicas=(), router=router, + cors_enabled=cors_enabled, ) async with lock: @@ -374,6 +376,7 @@ async def get_nginx_service_config( locations=locations, replicas=sorted(replicas, key=lambda r: r.id), # sort for reproducible configs router=service.router, + cors_enabled=service.cors_enabled, ) @@ -389,9 +392,34 @@ async def apply_entrypoint( await nginx.register(config, acme) +async def _migrate_cors_enabled(repo: GatewayProxyRepo) -> None: + """Migrate services registered before the cors_enabled field was added. + + Old gateway versions didn't persist cors_enabled on services. This derives it + from the associated model's format so that CORS is enabled for openai-format + models on gateway restart without requiring service re-registration. + """ + services = await repo.list_services() + openai_run_names: set[tuple[str, str]] = set() + for service in services: + for model in await repo.list_models(service.project_name): + if model.run_name == service.run_name and isinstance( + model.format_spec, models.OpenAIChatModelFormat + ): + openai_run_names.add((service.project_name, service.run_name)) + for service in services: + if ( + not service.cors_enabled + and (service.project_name, service.run_name) in openai_run_names + ): + updated = models.Service(**{**service.dict(), "cors_enabled": True}) + await repo.set_service(updated) + + async def apply_all( repo: GatewayProxyRepo, nginx: Nginx, service_conn_pool: ServiceConnectionPool ) -> None: + await _migrate_cors_enabled(repo) service_tasks = [ apply_service( service=service, diff --git a/src/dstack/_internal/proxy/lib/models.py b/src/dstack/_internal/proxy/lib/models.py index bf37e0b5aa..f304bbc394 100644 --- a/src/dstack/_internal/proxy/lib/models.py +++ b/src/dstack/_internal/proxy/lib/models.py @@ -59,6 +59,7 @@ class Service(ImmutableModel): strip_prefix: bool = True # only used in-server replicas: tuple[Replica, ...] 
router: Optional[AnyRouterConfig] = None + cors_enabled: bool = False # only used on gateways; enabled for openai-format models @property def domain_safe(self) -> str: diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index 3b6038ccd9..a9496ad348 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -12,8 +12,6 @@ from dstack._internal.core.errors import DockerRegistryError, ServerClientError from dstack._internal.core.models.common import RegistryAuth from dstack._internal.core.models.configurations import ( - DEFAULT_MODEL_PROBE_TIMEOUT, - DEFAULT_MODEL_PROBE_URL, DEFAULT_PROBE_INTERVAL, DEFAULT_PROBE_METHOD, DEFAULT_PROBE_READY_AFTER, @@ -22,6 +20,7 @@ DEFAULT_PROBE_URL, DEFAULT_REPLICA_GROUP_NAME, LEGACY_REPO_DIR, + OPENAI_MODEL_PROBE_TIMEOUT, HTTPHeaderSpec, PortMapping, ProbeConfig, @@ -406,7 +405,7 @@ def _probes(self) -> list[ProbeSpec]: # Generate default probe if model is set model = self.run_spec.configuration.model if isinstance(model, OpenAIChatModel): - return [_default_model_probe_spec(model.name)] + return [_openai_model_probe_spec(model.name, model.prefix)] return [] @@ -460,7 +459,7 @@ def _probe_config_to_spec(c: ProbeConfig) -> ProbeSpec: ) -def _default_model_probe_spec(model_name: str) -> ProbeSpec: +def _openai_model_probe_spec(model_name: str, prefix: str) -> ProbeSpec: body = orjson.dumps( { "model": model_name, @@ -471,12 +470,12 @@ def _default_model_probe_spec(model_name: str) -> ProbeSpec: return ProbeSpec( type="http", method="post", - url=DEFAULT_MODEL_PROBE_URL, + url=prefix.rstrip("/") + "/chat/completions", headers=[ HTTPHeaderSpec(name="Content-Type", value="application/json"), ], body=body, - timeout=DEFAULT_MODEL_PROBE_TIMEOUT, + timeout=OPENAI_MODEL_PROBE_TIMEOUT, interval=DEFAULT_PROBE_INTERVAL, ready_after=DEFAULT_PROBE_READY_AFTER, ) diff 
--git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index 06aa5b0ef0..511cf7cc93 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -27,6 +27,7 @@ from dstack._internal.core.models.gateways import GatewayConfiguration, GatewayStatus from dstack._internal.core.models.instances import SSHConnectionParams from dstack._internal.core.models.runs import JobSpec, Run, RunSpec, ServiceModelSpec, ServiceSpec +from dstack._internal.core.models.services import OpenAIChatModel from dstack._internal.server import settings from dstack._internal.server.models import GatewayModel, JobModel, ProjectModel, RunModel from dstack._internal.server.services import events @@ -106,10 +107,15 @@ async def _register_service_in_gateway( wildcard_domain = gateway.wildcard_domain.lstrip("*.") if gateway.wildcard_domain else None if wildcard_domain is None: raise ServerClientError("Domain is required for gateway") + service_url = f"{service_protocol}://{run_model.run_name}.{wildcard_domain}" + if isinstance(run_spec.configuration.model, OpenAIChatModel): + model_url = service_url + run_spec.configuration.model.prefix + else: + model_url = f"{gateway_protocol}://gateway.{wildcard_domain}" service_spec = get_service_spec( configuration=run_spec.configuration, - service_url=f"{service_protocol}://{run_model.run_name}.{wildcard_domain}", - model_url=f"{gateway_protocol}://gateway.{wildcard_domain}", + service_url=service_url, + model_url=model_url, ) domain = service_spec.get_domain() @@ -173,10 +179,15 @@ def _register_service_in_server(run_model: RunModel, run_spec: RunSpec) -> Servi "Rate limits are not supported when running services without a gateway." 
" Please configure a gateway or remove `rate_limits` from the service configuration" ) + service_url = f"/proxy/services/{run_model.project.name}/{run_model.run_name}/" + if isinstance(run_spec.configuration.model, OpenAIChatModel): + model_url = service_url.rstrip("/") + run_spec.configuration.model.prefix + else: + model_url = f"/proxy/models/{run_model.project.name}/" return get_service_spec( configuration=run_spec.configuration, - service_url=f"/proxy/services/{run_model.project.name}/{run_model.run_name}/", - model_url=f"/proxy/models/{run_model.project.name}/", + service_url=service_url, + model_url=model_url, ) diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index ad8ad878d1..be78414a9e 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -588,6 +588,7 @@ def get_service_run_spec( repo_id: str, run_name: Optional[str] = None, gateway: Optional[Union[bool, str]] = None, + model: Union[str, dict] = "test-model", ) -> dict: return { "configuration": { @@ -595,7 +596,7 @@ def get_service_run_spec( "commands": ["python -m http.server"], "port": 8000, "gateway": gateway, - "model": "test-model", + "model": model, "repos": [ { "url": "https://github.com/dstackai/dstack", @@ -2303,48 +2304,69 @@ def mock_gateway_connections(self) -> Generator[None, None, None]: "expected_service_url", "expected_model_url", "is_gateway", + "model", ), [ pytest.param( [("default-gateway", True), ("non-default-gateway", False)], None, "https://test-service.default-gateway.example", - "https://gateway.default-gateway.example", + "https://test-service.default-gateway.example/v1", True, + "test-model", id="submits-to-default-gateway", ), pytest.param( [("default-gateway", True), ("non-default-gateway", False)], True, "https://test-service.default-gateway.example", - "https://gateway.default-gateway.example", + "https://test-service.default-gateway.example/v1", 
True, + "test-model", id="submits-to-default-gateway-when-gateway-true", ), pytest.param( [("default-gateway", True), ("non-default-gateway", False)], "non-default-gateway", "https://test-service.non-default-gateway.example", - "https://gateway.non-default-gateway.example", + "https://test-service.non-default-gateway.example/v1", True, + "test-model", id="submits-to-specified-gateway", ), pytest.param( [("non-default-gateway", False)], None, "/proxy/services/test-project/test-service/", - "/proxy/models/test-project/", + "/proxy/services/test-project/test-service/v1", False, + "test-model", id="submits-in-server-when-no-default-gateway", ), pytest.param( [("default-gateway", True)], False, "/proxy/services/test-project/test-service/", - "/proxy/models/test-project/", + "/proxy/services/test-project/test-service/v1", False, + "test-model", id="submits-in-server-when-specified", ), + pytest.param( + [("default-gateway", True)], + None, + "https://test-service.default-gateway.example", + "https://gateway.default-gateway.example", + True, + { + "type": "chat", + "name": "test-model", + "format": "tgi", + "chat_template": "test", + "eos_token": "", + }, + id="submits-tgi-model-to-gateway", + ), ], ) async def test_submit_to_correct_proxy( @@ -2357,6 +2379,7 @@ async def test_submit_to_correct_proxy( expected_service_url: str, expected_model_url: str, is_gateway: bool, + model: Union[str, dict], ) -> None: user = await create_user(session=session, global_role=GlobalRole.USER) project = await create_project(session=session, owner=user, name="test-project") @@ -2386,6 +2409,7 @@ async def test_submit_to_correct_proxy( repo_id=repo.name, run_name="test-service", gateway=specified_gateway_in_run_conf, + model=model, ) response = await client.post( f"/api/project/{project.name}/runs/submit", diff --git a/src/tests/_internal/server/services/jobs/configurators/test_service.py b/src/tests/_internal/server/services/jobs/configurators/test_service.py index b52ee297a5..cafab73d9c 
100644 --- a/src/tests/_internal/server/services/jobs/configurators/test_service.py +++ b/src/tests/_internal/server/services/jobs/configurators/test_service.py @@ -1,8 +1,7 @@ import pytest from dstack._internal.core.models.configurations import ( - DEFAULT_MODEL_PROBE_TIMEOUT, - DEFAULT_MODEL_PROBE_URL, + OPENAI_MODEL_PROBE_TIMEOUT, ProbeConfig, ServiceConfiguration, ) @@ -35,8 +34,8 @@ async def test_default_probe_when_model_set(self): probe = probes[0] assert probe.type == "http" assert probe.method == "post" - assert probe.url == DEFAULT_MODEL_PROBE_URL - assert probe.timeout == DEFAULT_MODEL_PROBE_TIMEOUT + assert probe.url == "/v1/chat/completions" + assert probe.timeout == OPENAI_MODEL_PROBE_TIMEOUT assert len(probe.headers) == 1 assert probe.headers[0].name == "Content-Type" assert probe.headers[0].value == "application/json" From 3d8c9ba182ec839bbb1e774489a28ef7054216d6 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 17 Feb 2026 11:06:45 +0100 Subject: [PATCH 141/187] Set explicit GPU defaults in ResourcesSpec and improve default GPU vendor selection (#3573) * Set explicit GPU default (`0..`) in `ResourcesSpec` and minor improvements in resource pretty-printing * Change how GPU vendor default is set to make it more explicit - Default to NVIDIA only if user has no image - Keep backward compatibility with old/new server/CLI - Make `dstack offer` consistent with `dstack apply` --- src/dstack/_internal/cli/commands/offer.py | 5 +- .../cli/services/configurators/run.py | 20 ++++- src/dstack/_internal/core/models/resources.py | 7 +- .../_internal/server/services/resources.py | 23 ++++++ .../server/services/runs/__init__.py | 27 +++++-- .../_internal/server/services/runs/spec.py | 10 ++- src/dstack/_internal/utils/common.py | 28 +++---- .../cli/services/configurators/test_run.py | 23 +++++- .../_internal/server/routers/test_fleets.py | 27 ++++++- .../_internal/server/routers/test_runs.py | 11 +++ 
src/tests/_internal/utils/test_common.py | 75 +++++++++++++++++++ 11 files changed, 219 insertions(+), 37 deletions(-) diff --git a/src/dstack/_internal/cli/commands/offer.py b/src/dstack/_internal/cli/commands/offer.py index 0e4be1d5c2..dd81231536 100644 --- a/src/dstack/_internal/cli/commands/offer.py +++ b/src/dstack/_internal/cli/commands/offer.py @@ -74,7 +74,10 @@ def _register(self): def _command(self, args: argparse.Namespace): super()._command(args) - conf = TaskConfiguration(commands=[":"]) + # Set image and user so that the server (a) does not default gpu.vendor + # to nvidia — `dstack offer` should show all vendors, and (b) does not + # attempt to pull image config from the Docker registry. + conf = TaskConfiguration(commands=[":"], image="scratch", user="root") configurator = OfferConfigurator(api_client=self.api) configurator.apply_args(conf, args) diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index 33ba4e10d2..c4c4d3488a 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -383,7 +383,13 @@ def interpolate_env(self, conf: RunConfigurationT): def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None: """ - Infers and sets `resources.gpu.vendor` if not set, requires `image` if the vendor is AMD. + Infers GPU vendor if not set. Defaults to Nvidia when using the default + CUDA image. Requires explicit `image` if the vendor is AMD or Tenstorrent. + + NOTE: We don't set the inferred vendor on gpu_spec for compatibility with + older servers. Servers set the vendor using the same logic in + set_resources_defaults(). The inferred vendor is used here only for + validation and display (see _infer_gpu_vendor). """ gpu_spec = conf.resources.gpu if gpu_spec is None: @@ -425,12 +431,18 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None: # CUDA image, not a big deal. 
has_amd_gpu = gpuhunt.AcceleratorVendor.AMD in vendors has_tt_gpu = gpuhunt.AcceleratorVendor.TENSTORRENT in vendors + # Set vendor inferred from name on the spec (server needs it for filtering). + gpu_spec.vendor = vendor else: - # If neither gpu.vendor nor gpu.name is set, assume Nvidia. - vendor = gpuhunt.AcceleratorVendor.NVIDIA + # No vendor or name specified. Default to Nvidia if using the default + # CUDA image, since it's only compatible with Nvidia GPUs. + # We don't set the inferred vendor on the spec — the server does the + # same inference in set_resources_defaults() for compatibility with + # older servers that don't handle vendor + count.min=0 correctly. + if conf.image is None and conf.docker is not True: + vendor = gpuhunt.AcceleratorVendor.NVIDIA has_amd_gpu = False has_tt_gpu = False - gpu_spec.vendor = vendor else: has_amd_gpu = vendor == gpuhunt.AcceleratorVendor.AMD has_tt_gpu = vendor == gpuhunt.AcceleratorVendor.TENSTORRENT diff --git a/src/dstack/_internal/core/models/resources.py b/src/dstack/_internal/core/models/resources.py index 20b4f3aa55..02cbbdc9b8 100644 --- a/src/dstack/_internal/core/models/resources.py +++ b/src/dstack/_internal/core/models/resources.py @@ -319,6 +319,9 @@ def _vendor_from_string(cls, v: str) -> gpuhunt.AcceleratorVendor: return gpuhunt.AcceleratorVendor.cast(v) +DEFAULT_GPU_SPEC = GPUSpec(count=Range[int](min=0, max=None)) + + class DiskSpecConfig(CoreConfig): @staticmethod def schema_extra(schema: Dict[str, Any]): @@ -387,7 +390,8 @@ class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)): "you may need to configure this" ), ] = None - gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None + # Optional for backward compatibility + gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = DEFAULT_GPU_SPEC disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK def pretty_format(self) -> str: @@ -397,6 +401,7 @@ def 
pretty_format(self) -> str: if self.gpu: gpu = self.gpu resources.update( + gpu_vendor=gpu.vendor, gpu_name=",".join(gpu.name) if gpu.name else None, gpu_count=gpu.count, gpu_memory=gpu.memory, diff --git a/src/dstack/_internal/server/services/resources.py b/src/dstack/_internal/server/services/resources.py index 17cd80a662..aab47de21c 100644 --- a/src/dstack/_internal/server/services/resources.py +++ b/src/dstack/_internal/server/services/resources.py @@ -1,3 +1,5 @@ +from typing import Optional + import gpuhunt from pydantic import parse_obj_as @@ -19,3 +21,24 @@ def set_resources_defaults(resources: ResourcesSpec) -> None: else: cpu.arch = gpuhunt.CPUArchitecture.X86 resources.cpu = cpu + + +def set_gpu_vendor_default( + resources: ResourcesSpec, + image: Optional[str], + docker: Optional[bool], +) -> None: + """Default GPU vendor to Nvidia when using the default CUDA image, + since it's only compatible with Nvidia GPUs. + Mirrors the client-side logic in validate_gpu_vendor_and_image(). 
+ Should only be called for runs (not fleets) since fleets don't have image context.""" + gpu = resources.gpu + if ( + gpu is not None + and gpu.vendor is None + and gpu.name is None + and gpu.count.max != 0 + and image is None + and docker is not True + ): + gpu.vendor = gpuhunt.AcceleratorVendor.NVIDIA diff --git a/src/dstack/_internal/server/services/runs/__init__.py b/src/dstack/_internal/server/services/runs/__init__.py index 5ae19b348f..73966916b5 100644 --- a/src/dstack/_internal/server/services/runs/__init__.py +++ b/src/dstack/_internal/server/services/runs/__init__.py @@ -65,7 +65,10 @@ from dstack._internal.server.services.plugins import apply_plugin_policies from dstack._internal.server.services.probes import is_probe_ready from dstack._internal.server.services.projects import list_user_project_models -from dstack._internal.server.services.resources import set_resources_defaults +from dstack._internal.server.services.resources import ( + set_gpu_vendor_default, + set_resources_defaults, +) from dstack._internal.server.services.runs.plan import get_job_plans from dstack._internal.server.services.runs.spec import ( can_update_run_spec, @@ -343,8 +346,8 @@ async def get_plan( ) if current_resource is not None: # For backward compatibility (current_resource may has been submitted before - # some fields, e.g., CPUSpec.arch, were added) - set_resources_defaults(current_resource.run_spec.configuration.resources) + # some fields, e.g., CPUSpec.arch, gpu.vendor were added) + _set_run_resources_defaults(current_resource.run_spec) if not current_resource.status.is_finished() and can_update_run_spec( current_resource.run_spec, effective_run_spec ): @@ -354,7 +357,7 @@ async def get_plan( session=session, project=project, profile=profile, - run_spec=run_spec, + run_spec=effective_run_spec, max_offers=max_offers, ) run_plan = RunPlan( @@ -410,8 +413,8 @@ async def apply_plan( current_resource = run_model_to_run(current_resource_model, return_in_api=True) # For 
backward compatibility (current_resource may has been submitted before - # some fields, e.g., CPUSpec.arch, were added) - set_resources_defaults(current_resource.run_spec.configuration.resources) + # some fields, e.g., CPUSpec.arch, gpu.vendor were added) + _set_run_resources_defaults(current_resource.run_spec) try: spec_diff = check_can_update_run_spec(current_resource.run_spec, run_spec) except ServerClientError: @@ -421,7 +424,7 @@ async def apply_plan( raise if not force: if plan.current_resource is not None: - set_resources_defaults(plan.current_resource.run_spec.configuration.resources) + _set_run_resources_defaults(plan.current_resource.run_spec) if ( plan.current_resource is None or plan.current_resource.id != current_resource.id @@ -782,6 +785,16 @@ def run_model_to_run( return run +def _set_run_resources_defaults(run_spec: RunSpec) -> None: + """Apply resource defaults to a run spec, including GPU vendor inference.""" + set_resources_defaults(run_spec.configuration.resources) + set_gpu_vendor_default( + run_spec.configuration.resources, + image=run_spec.configuration.image, + docker=getattr(run_spec.configuration, "docker", None), + ) + + def _get_run_jobs_with_submissions( run_model: RunModel, job_submissions_limit: Optional[int], diff --git a/src/dstack/_internal/server/services/runs/spec.py b/src/dstack/_internal/server/services/runs/spec.py index f478d187bb..a18f151ce1 100644 --- a/src/dstack/_internal/server/services/runs/spec.py +++ b/src/dstack/_internal/server/services/runs/spec.py @@ -8,7 +8,10 @@ from dstack._internal.server import settings from dstack._internal.server.models import UserModel from dstack._internal.server.services.docker import is_valid_docker_volume_target -from dstack._internal.server.services.resources import set_resources_defaults +from dstack._internal.server.services.resources import ( + set_gpu_vendor_default, + set_resources_defaults, +) from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) 
@@ -108,6 +111,11 @@ def validate_run_spec_and_set_defaults( if run_spec.configuration.priority is None: run_spec.configuration.priority = RUN_PRIORITY_DEFAULT set_resources_defaults(run_spec.configuration.resources) + set_gpu_vendor_default( + run_spec.configuration.resources, + image=run_spec.configuration.image, + docker=getattr(run_spec.configuration, "docker", None), + ) if run_spec.ssh_key_pub is None: if user.ssh_public_key: run_spec.ssh_key_pub = user.ssh_public_key diff --git a/src/dstack/_internal/utils/common.py b/src/dstack/_internal/utils/common.py index 28becc936f..ba139c6bfc 100644 --- a/src/dstack/_internal/utils/common.py +++ b/src/dstack/_internal/utils/common.py @@ -91,28 +91,14 @@ def pretty_resources( cpus: Optional[Any] = None, memory: Optional[Any] = None, gpu_count: Optional[Any] = None, + gpu_vendor: Optional[Any] = None, gpu_name: Optional[Any] = None, gpu_memory: Optional[Any] = None, total_gpu_memory: Optional[Any] = None, compute_capability: Optional[Any] = None, disk_size: Optional[Any] = None, ) -> str: - """ - >>> pretty_resources(cpus=4, memory="16GB") - '4xCPU, 16GB' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1) - '4xCPU, 16GB, 1xGPU' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name='A100') - '4xCPU, 16GB, 1xA100' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name='A100', gpu_memory="40GB") - '4xCPU, 16GB, 1xA100 (40GB)' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=1, total_gpu_memory="80GB") - '4xCPU, 16GB, 1xGPU (total 80GB)' - >>> pretty_resources(cpus=4, memory="16GB", gpu_count=2, gpu_name='A100', gpu_memory="40GB", total_gpu_memory="80GB") - '4xCPU, 16GB, 2xA100 (40GB, total 80GB)' - >>> pretty_resources(gpu_count=1, compute_capability="8.0") - '1xGPU (8.0)' - """ + """Format resource requirements as a human-readable string.""" parts = [] if cpus is not None: cpu_arch_lower: Optional[str] = None @@ -131,7 +117,6 @@ def pretty_resources( 
parts.append(f"disk={disk_size}") if gpu_count: gpu_parts = [] - gpu_parts.append(f"{gpu_name or 'gpu'}") if gpu_memory is not None: gpu_parts.append(f"{gpu_memory}") if gpu_count is not None: @@ -141,8 +126,13 @@ def pretty_resources( if compute_capability is not None: gpu_parts.append(f"{compute_capability}") - gpu = ":".join(gpu_parts) - parts.append(gpu) + if gpu_name: + parts.append("gpu=" + ":".join([f"{gpu_name}"] + gpu_parts)) + elif gpu_vendor: + vendor_str = gpu_vendor.value if isinstance(gpu_vendor, enum.Enum) else str(gpu_vendor) + parts.append("gpu=" + ":".join([vendor_str] + gpu_parts)) + else: + parts.append("gpu=" + ":".join(gpu_parts)) return " ".join(parts) diff --git a/src/tests/_internal/cli/services/configurators/test_run.py b/src/tests/_internal/cli/services/configurators/test_run.py index eb5027671a..6238bcb025 100644 --- a/src/tests/_internal/cli/services/configurators/test_run.py +++ b/src/tests/_internal/cli/services/configurators/test_run.py @@ -132,13 +132,34 @@ def validate(self, conf: BaseRunConfiguration) -> None: def test_no_gpu(self): conf = self.prepare_conf() self.validate(conf) - assert conf.resources.gpu is None + assert conf.resources.gpu is not None + # Vendor is not written to spec for compatibility with older servers. + # The server infers nvidia in set_resources_defaults(). 
+ assert conf.resources.gpu.vendor is None + assert conf.resources.gpu.name is None + assert conf.resources.gpu.count.min == 0 def test_zero_gpu(self): conf = self.prepare_conf(gpu_spec="0") self.validate(conf) assert conf.resources.gpu.vendor is None + def test_gpu_no_vendor_no_image_defaults_to_nvidia(self): + """Vendor is inferred as nvidia for validation but NOT written to spec.""" + conf = self.prepare_conf(gpu_spec="1") + self.validate(conf) + assert conf.resources.gpu.vendor is None + + def test_gpu_no_vendor_with_image_no_default(self): + conf = self.prepare_conf(gpu_spec="1", image="my-custom-image") + self.validate(conf) + assert conf.resources.gpu.vendor is None + + def test_gpu_no_vendor_docker_true_no_default(self): + conf = self.prepare_conf(gpu_spec="1", docker=True) + self.validate(conf) + assert conf.resources.gpu.vendor is None + @pytest.mark.parametrize( ["gpu_spec", "expected_vendor"], [ diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index b00d6ccf57..fef712acae 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -344,7 +344,14 @@ async def test_creates_fleet(self, test_db, session: AsyncSession, client: Async "cpu": {"min": 2, "max": None}, "memory": {"min": 8.0, "max": None}, "shm_size": None, - "gpu": None, + "gpu": { + "vendor": None, + "name": None, + "count": {"min": 0, "max": None}, + "memory": None, + "total_memory": None, + "compute_capability": None, + }, "disk": {"size": {"min": 100.0, "max": None}}, }, "backends": None, @@ -467,7 +474,14 @@ async def test_creates_ssh_fleet(self, test_db, session: AsyncSession, client: A "cpu": {"min": 2, "max": None}, "memory": {"min": 8.0, "max": None}, "shm_size": None, - "gpu": None, + "gpu": { + "vendor": None, + "name": None, + "count": {"min": 0, "max": None}, + "memory": None, + "total_memory": None, + "compute_capability": None, + }, "disk": {"size": 
{"min": 100.0, "max": None}}, }, "backends": None, @@ -639,7 +653,14 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A "cpu": {"min": 2, "max": None}, "memory": {"min": 8.0, "max": None}, "shm_size": None, - "gpu": None, + "gpu": { + "vendor": None, + "name": None, + "count": {"min": 0, "max": None}, + "memory": None, + "total_memory": None, + "compute_capability": None, + }, "disk": {"size": {"min": 100.0, "max": None}}, }, "backends": None, diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index be78414a9e..0c5ca338df 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -47,6 +47,10 @@ from dstack._internal.server.models import JobModel, RunModel from dstack._internal.server.schemas.runs import ApplyRunPlanRequest from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.services.resources import ( + set_gpu_vendor_default, + set_resources_defaults, +) from dstack._internal.server.services.runs import run_model_to_run from dstack._internal.server.services.runs.spec import validate_run_spec_and_set_defaults from dstack._internal.server.testing.common import ( @@ -1535,6 +1539,13 @@ async def test_returns_update_or_create_action_on_conf_change( run_spec=run_spec, ) run = run_model_to_run(run_model) + # Apply the same defaults the server applies to current_resource + set_resources_defaults(run.run_spec.configuration.resources) + set_gpu_vendor_default( + run.run_spec.configuration.resources, + image=run.run_spec.configuration.image, + docker=getattr(run.run_spec.configuration, "docker", None), + ) run_spec.configuration = new_conf response = await client.post( f"/api/project/{project.name}/runs/get_plan", diff --git a/src/tests/_internal/utils/test_common.py b/src/tests/_internal/utils/test_common.py index 140627580f..70d12c8f39 100644 --- 
a/src/tests/_internal/utils/test_common.py +++ b/src/tests/_internal/utils/test_common.py @@ -13,6 +13,7 @@ make_proxy_url, parse_memory, pretty_date, + pretty_resources, sizeof_fmt, ) @@ -239,6 +240,80 @@ def test_make_proxy_url(server_url, proxy_url, expected_url): assert make_proxy_url(server_url, proxy_url) == expected_url +class TestPrettyResources: + def test_cpu_and_memory(self): + assert pretty_resources(cpus=4, memory="16GB") == "cpu=4 mem=16GB" + + def test_gpu_count_without_name(self): + assert pretty_resources(cpus=4, memory="16GB", gpu_count=1) == "cpu=4 mem=16GB gpu=1" + + def test_gpu_count_with_vendor(self): + assert ( + pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_vendor="nvidia") + == "cpu=4 mem=16GB gpu=nvidia:1" + ) + + def test_gpu_count_with_name(self): + assert ( + pretty_resources(cpus=4, memory="16GB", gpu_count=1, gpu_name="A100") + == "cpu=4 mem=16GB gpu=A100:1" + ) + + def test_gpu_with_name_and_memory(self): + assert ( + pretty_resources( + cpus=4, memory="16GB", gpu_count=1, gpu_name="A100", gpu_memory="40GB" + ) + == "cpu=4 mem=16GB gpu=A100:40GB:1" + ) + + def test_gpu_with_total_memory_without_name(self): + assert ( + pretty_resources(cpus=4, memory="16GB", gpu_count=1, total_gpu_memory="80GB") + == "cpu=4 mem=16GB gpu=1:80GB" + ) + + def test_gpu_with_name_memory_and_total_memory(self): + assert ( + pretty_resources( + cpus=4, + memory="16GB", + gpu_count=2, + gpu_name="A100", + gpu_memory="40GB", + total_gpu_memory="80GB", + ) + == "cpu=4 mem=16GB gpu=A100:40GB:2:80GB" + ) + + def test_gpu_with_compute_capability(self): + assert pretty_resources(gpu_count=1, compute_capability="8.0") == "gpu=1:8.0" + + def test_disk(self): + assert ( + pretty_resources(cpus=2, memory="8GB", disk_size="100GB") == "cpu=2 mem=8GB disk=100GB" + ) + + def test_no_gpu(self): + assert pretty_resources(cpus=2, memory="8GB") == "cpu=2 mem=8GB" + + def test_gpu_zero_count_range(self): + """Default GPU spec (0..) 
should display gpu=0..""" + assert ( + pretty_resources(cpus=2, memory="8GB", disk_size="100GB", gpu_count="0..") + == "cpu=2 mem=8GB disk=100GB gpu=0.." + ) + + def test_gpu_zero_count_range_with_vendor(self): + """Default GPU spec with nvidia vendor should display gpu=nvidia:0..""" + assert ( + pretty_resources( + cpus=2, memory="8GB", disk_size="100GB", gpu_count="0..", gpu_vendor="nvidia" + ) + == "cpu=2 mem=8GB disk=100GB gpu=nvidia:0.." + ) + + class TestSizeofFmt: @pytest.mark.parametrize( ("num", "suffix", "expected"), From 94a7a9f168c24b328480316f704bdcca512ce4fd Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 17 Feb 2026 13:54:44 +0100 Subject: [PATCH 142/187] Add `--verbose` to `dstack apply` and enhance run plan output (show creation policy and reservation only if non-default or verbose); replaced `-` with actual defaults (`off`) (#3572) --- src/dstack/_internal/cli/commands/apply.py | 6 ++ .../cli/services/configurators/run.py | 7 ++- src/dstack/_internal/cli/utils/run.py | 62 ++++++++++--------- 3 files changed, 44 insertions(+), 31 deletions(-) diff --git a/src/dstack/_internal/cli/commands/apply.py b/src/dstack/_internal/cli/commands/apply.py index ab73ae2f2b..c10f54f33e 100644 --- a/src/dstack/_internal/cli/commands/apply.py +++ b/src/dstack/_internal/cli/commands/apply.py @@ -62,6 +62,12 @@ def _register(self): help="Exit immediately after submitting configuration", action="store_true", ) + self._parser.add_argument( + "-v", + "--verbose", + help="Show all plan properties including those with default values", + action="store_true", + ) def _command(self, args: argparse.Namespace): try: diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index c4c4d3488a..0322cac229 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -115,7 +115,12 @@ def 
apply_configuration( if len(self.api.client.fleets.list(self.api.project)) == 0: no_fleets = True - print_run_plan(run_plan, max_offers=configurator_args.max_offers, no_fleets=no_fleets) + print_run_plan( + run_plan, + max_offers=configurator_args.max_offers, + no_fleets=no_fleets, + verbose=command_args.verbose, + ) confirm_message = "Submit a new run?" if conf.name: diff --git a/src/dstack/_internal/cli/utils/run.py b/src/dstack/_internal/cli/utils/run.py index 588f74bc69..ada42b5e6c 100644 --- a/src/dstack/_internal/cli/utils/run.py +++ b/src/dstack/_internal/cli/utils/run.py @@ -21,6 +21,7 @@ ) from dstack._internal.core.models.profiles import ( DEFAULT_RUN_TERMINATION_IDLE_TIME, + CreationPolicy, SpotPolicy, TerminationPolicy, ) @@ -84,6 +85,7 @@ def print_run_plan( max_offers: Optional[int] = None, include_run_properties: bool = True, no_fleets: bool = False, + verbose: bool = False, ): run_spec = run_plan.get_effective_run_spec() job_plan = run_plan.job_plans[0] @@ -94,36 +96,35 @@ def print_run_plan( req = job_plan.job_spec.requirements pretty_req = req.pretty_format(resources_only=True) - max_price = f"${req.max_price:3f}".rstrip("0").rstrip(".") if req.max_price else "-" + max_price = f"${req.max_price:3f}".rstrip("0").rstrip(".") if req.max_price else "off" max_duration = ( format_pretty_duration(job_plan.job_spec.max_duration) if job_plan.job_spec.max_duration - else "-" + else "off" ) - if include_run_properties: - inactivity_duration = None - if isinstance(run_spec.configuration, DevEnvironmentConfiguration): - inactivity_duration = "-" - if isinstance(run_spec.configuration.inactivity_duration, int): - inactivity_duration = format_pretty_duration( - run_spec.configuration.inactivity_duration - ) - if job_plan.job_spec.retry is None: - retry = "-" - else: - retry = escape(job_plan.job_spec.retry.pretty_format()) - - profile = run_spec.merged_profile - creation_policy = profile.creation_policy - # FIXME: This assumes the default idle_duration is the 
same for client and server. - # If the server changes idle_duration, old clients will see incorrect value. - termination_policy, termination_idle_time = get_termination( - profile, DEFAULT_RUN_TERMINATION_IDLE_TIME - ) - if termination_policy == TerminationPolicy.DONT_DESTROY: - idle_duration = "-" - else: - idle_duration = format_pretty_duration(termination_idle_time) + inactivity_duration = None + if isinstance(run_spec.configuration, DevEnvironmentConfiguration): + inactivity_duration = "off" + if isinstance(run_spec.configuration.inactivity_duration, int): + inactivity_duration = format_pretty_duration( + run_spec.configuration.inactivity_duration + ) + if job_plan.job_spec.retry is None: + retry = "off" + else: + retry = escape(job_plan.job_spec.retry.pretty_format()) + + profile = run_spec.merged_profile + creation_policy = profile.creation_policy + # FIXME: This assumes the default idle_duration is the same for client and server. + # If the server changes idle_duration, old clients will see incorrect value. 
+ termination_policy, termination_idle_time = get_termination( + profile, DEFAULT_RUN_TERMINATION_IDLE_TIME + ) + if termination_policy == TerminationPolicy.DONT_DESTROY: + idle_duration = "-" + else: + idle_duration = format_pretty_duration(termination_idle_time) if req.spot is None: spot_policy = "auto" @@ -138,7 +139,6 @@ def th(s: str) -> str: props.add_row(th("Project"), run_plan.project_name) props.add_row(th("User"), run_plan.user) if include_run_properties: - props.add_row(th("Configuration"), run_spec.configuration_path) configuration_type = run_spec.configuration.type if run_spec.configuration.type == "task": configuration_type += f" (nodes={run_spec.configuration.nodes})" @@ -148,12 +148,14 @@ def th(s: str) -> str: props.add_row(th("Max price"), max_price) if include_run_properties: props.add_row(th("Retry policy"), retry) - props.add_row(th("Creation policy"), creation_policy) + if verbose or creation_policy != CreationPolicy.REUSE_OR_CREATE: + props.add_row(th("Creation policy"), creation_policy) props.add_row(th("Idle duration"), idle_duration) props.add_row(th("Max duration"), max_duration) - if inactivity_duration is not None: # None means n/a + if inactivity_duration is not None: # only set for dev-environment props.add_row(th("Inactivity duration"), inactivity_duration) - props.add_row(th("Reservation"), run_spec.configuration.reservation or "-") + if verbose or run_spec.configuration.reservation: + props.add_row(th("Reservation"), run_spec.configuration.reservation or "no") offers = Table(box=None, expand=shutil.get_terminal_size(fallback=(120, 40)).columns <= 110) offers.add_column("#") From 42c0752e18322fab38582431941f48b2ed2a6165 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 18 Feb 2026 01:09:05 +0100 Subject: [PATCH 143/187] Cosmetical changes to the home page (font; headline; etc) (#3582) * Cosmetical changes to the home page (font; headline; etc) * Cosmetical changes to the 
home page (text; etc) --- docs/assets/fonts/GeistPixel-Circle.woff2 | Bin 0 -> 28104 bytes docs/assets/fonts/GeistPixel-Square.woff2 | Bin 0 -> 28616 bytes docs/assets/stylesheets/extra.css | 101 ++++--------------- docs/assets/stylesheets/landing.css | 40 ++++++-- docs/assets/stylesheets/pricing.css | 2 +- docs/overrides/home.html | 113 +++++++++++----------- docs/overrides/main.html | 5 + 7 files changed, 115 insertions(+), 146 deletions(-) create mode 100644 docs/assets/fonts/GeistPixel-Circle.woff2 create mode 100644 docs/assets/fonts/GeistPixel-Square.woff2 diff --git a/docs/assets/fonts/GeistPixel-Circle.woff2 b/docs/assets/fonts/GeistPixel-Circle.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..100da9fb0a305ec979a5bb798ad7588a18967ea1 GIT binary patch literal 28104 zcmV(@K-Rx^Pew8T0RR910By(s5&!@I1mgSv0Bv3X0RR9100000000000000000000 z0000QfgT&2SR9T-24Db+E(mN1oD2~N3X-b~knpm>HtM#8UJuI1JxKXkWAYP}a*57@5F5aX^!Ls;1}PyTt>-@~`|x$lFl zP!iZ29SVSK&a?@I2t`$Qh_-om|D=eG*vJ%70TmS#3yj160MWVM-mQ)<(PI-p>dl zY>FO%+dAc^6nMUDwfB3DSyZM7tr5*!yDr)TB9>w>?P4QLv&*hJzKgI-yb-rXME0ft z0{;Is-|l^nC&QTV$NUmb0U)mF;t09O&U{%}zsIkW+us1!rw7gnLW>fwj5o;5qn55L znA$#?;w<;Q3rBZ>!_ihx!@~!h7KdOs1d|B>MDX<5QZA~56&Pj0_jCa=Y)D55pB>T! 
zCEwab?V{eU`r5GR>wI>Iy3Cb*emFuC=`SM!%$9C~aBs8hwdEqpcSEvgJSYb))QJ&f z)QHTg6I)-;^U{qo8b^>(M#s+*8IRy=M+E|#W3}V0b20V(*4t+l%jbWo1#227f6*4q zYrM>&T`)U5$f7Q|JAk&}x?s)&EXShLFnNGlOgAV$Gi@ntT)XSW4{FgB%x==P!WPGGlKC+b?n<-Tieh~l13L<6E2buCjel>li^51(> znts~3vs6OwWM-rUzz~#-_Wq7X+VsvUOrYIwlg>(IBskE=udTjWIbeT+ROwAET}P6J z)`0J8@BdGzHKZ)f{9pxX6d@{{dUN`mV$;1ZDH`R$-CqU#f9>o03AGbVAYDdvV;xsA zNyX1vc_Qkcy_4xR%)&KB=z5IxQNt)KP@jvp0!WQPhkZ+-G5C>6GOvn2DsLRn+`)gf z_bUOqZ@oMdLsRja$v{|08n%p2XP(U1H+5^B?GSc^fY&4U7mBS+rH#8z5O0=Zdo#>u z{_^`d$z8q*7Lgz!1PMh#kjy3`A|g^kM~X-hks{8#t(3lPTmRq3=i~nkuV;TRP06%D zA*E0X@IY*TS^h9AcYb-@_5#DlFnkP$;V=xtFbuusqSby_H8u?kS|qQOE+k&s{#Oj+u zIO{E64P2aeOOseTx{V~fF7r4~S}HIR7Xg5UlaKYA8$Fn%HL6W}XWmRUYSg!8HnIj_ z(>^ORH3Qw1cbk-II!d##nbd<(6roKJOx+qA+&d8fMt@=La!Y9xtrE@1E2&M*z^GzV zJF-zNTW?t-umN_#J$QxQ-`a@(J0l|6t)T(ioCGIkE&&HslrsqS*YiTCl1TyQDn?MJ z3Efab`B-BeUzc4M;GSpLzJ-Ixym=>yPgoL!266)IEtnHN5w8cxL2Gi2)*<$VU@X}N z+hyXgGfMA;)qa)g^TEp^!ucgc24b*DOgkg;0(q7EP*IJQ3O_0Vc|L!-ZjONb#Njj2PaZTPXsqX zs5EEOX`eTCfe3b?fV3XETc{5K@iiLT$N112F2fq0K)wNn`v) zR4F3)l65T~3Dj~rGmc+ixkwT*R^*Mn+B&tSB2VIVHhtlmNA7qenHO0a z(jBbE$e0o#JMm?tZ&sr}e(AyY=gGynk3}4PXeD@6_@;ijYb}9Ln&1#To$g!@Nbv+s zsH&SwI17&nb|pxzMD`Mvk1h_)dEDcs5TzCkzmmKWTNacpHe}%@EH-em_X-XIqwe~5U7m3UVHDiHFFOMvW9AOMzbx?L32ZuOH0cUR)ki3q|2)R%~>*syJ@ z0@*}E+u0mm8Ih%+gmdv>!E`4=ApJU6c*OaYkUR^jG05Iy90qiQr8BswE|1FgDJ8#RuU;3hj2!Az^H_BT#-u)a*}MBDq1 zG2t;_VR#0|hL><$cmv0W_pm5@a!rZhtM{#O;(vfMKJgB;Jow^RoPPkZ!+H=TxrO!M zhnNu;I~Bwkv}3^Ge&M=Z!le}}Lb5K-QUz28Tu_k=U#zAQBG-GuK;GBnaQKV$4Ry-x z;hWFWg}MuM7wRq~FVtDcUZ}HB_Y~?5h})hPxU;)E!AVYZ@+=lRO*{6O0oZf;VXtY! 
zjIm?yv0~;}unz!K+5=@R2=39jev(&Q*>?k<7%C?;&gfjYa^ubeoX!Z~Y|Ry%wkQFR z@cIPU64e+Szyj^ih~{bN-yZs!n5AQA%+rkRuMJ6X9-P&o89&wc$YV45Nvw?BIj)M< z7e8mN4o^hvYuQ`g8^KV+^2T(&1iSBivMHvTX8LtM95FtDDA8gF!@yY?9D65Xrxnad zHQzN*dxLW?vc#rHIkP{ZxfI1{V~jPd}tzQ0?97l5WG|_YTUvgWLrod!``py&C{p z0B8e1TL93O0QI8qM16(rQY4I8E^fFHMjADNf#c2(sf7ZQcoNG^5y<90S$mSOTs$jZc1Xw+`i)lTX-rh)`j;!XdB} z*=JzvvMa8R@8o(Jp?v*1NBId2<&P68Kv<|i+)zQnL$O4J3KkiPjTg#Wbf^+}-w+kB zM4#znp{4lvfTM^&3C}X;&R@3}2BP8F=4?7&rv(;TR1?>Fx^Ot?v>d)D(P9Y2iW5&n z+(LPdaut+%c1MO_jr*^=L(=7F-ctG-mpNxzZy#jW!`y4D!mw(|ngQXP3E?N_T?Ncb z6Q}!L&j`5;iUI)vwgGhuJH{u>lnB5)KB|BV$&2rngr()({kyStWK#XtUd*k}Fp@`_BBpRHDNMfmS71A4Q zvT4pu&WNYi?3v@!x56&@F_ zZa-UVUAE1h+H*(jE%q_{%!>~1`i5Q0)`xDcpBnl9pTGbAp8px(V9_KN#~_eMMy`ah zV%yotyz^tzxg6M*7**!L9?ZgN>Am;_s#N7FOLtl(n#qiN{@NOI*j9UzdaCH)A-VW= z0N&{TU&vmN4PU9P&T6YCFXwx}|K2K1DGe#{0RHy#0F=x3&to2LAC^5#d6*6U3;gw- z2O0qWy3^p^;LgBbH@!UYm#3)6`^x(X0Q_;sp3AE{09!x06&ut6crW2FCw;1ONsq|B0 zvb6^3&&Z_Obmg?l)T*E~&unv5Y7cTo=ZUF@5E1yqB$COb%TcOKIh7i9-@SqTng`ZM zqsbEWuKe_8tbwsJhyzb&<6`t~xcQBs`Rewo&NsgzKq7R4H0aK3j0TG+^fJj*9i|y; zh)d2Av0JRpFn3(EK(P{6y)l^DT^oqCk}FWCcO(db6Cx@aH-UWk@fZ7o^NH}5tw63k z#qyh35hI;?25l6+>NNYJ{ohOTEwaQ)TWsXe<%DiW9kbqDkKFgbGY@;vQ~$wlfBnlD zyeturEr6MWVj&mHxm(4(`^5LWLPj z8kt1bjPMBv7H&UVRA7k&tWiL)?fU21fZSNwG_&ZV#W>?tnP9lh*4xNot8KQ}V3RH@ z&CqFnLt^Avdtb-qk_rBZsR;Gr5OS9JMkQBabnyO-?xZ^a0q)YoL|*=b!K;^uo6m_; zb1a|ofpo7PAzuC~>6Yd~_nh~@6F?pJ5QI4f5VZLqK%icIfI`}zbo|&-91&1*j+Km$ zVx6_!fbjjHU>by@(!wqeds3+|N4nM-58Af;y7c=Gq4OI86?u`@48RhLKgx)*xnN^Nb2Qvv1kV9KB&J3SaL7bpURptNo{$z%7M=G7=71Q2>QEZ5vW#Lx znqax2h+LIh&yxkb;XgisjfR_u(*@*+?8ERYTktYCV|PdN{@5NfCQR@9x5^j zq~0%WQ3bGAAmstr1T(S`&=3lxHW&Yxl=>RK&}Rc~9@E~DfnE;^C-J%;D>U+tnPs}( zIVJ@N^DL8YQb^ke2j<%KEZRBxa`r>LZjJpf#&Vlw>IxtfMmEXx7 z_W=i$RQC9^7zbQ{Sp0uzi%boP{9X{*4rX1XAhzg^l58Zy+HdsWwWB?%^s{@0(kKm*V)aR+9 zFvC1{e`_iuZo{j)G{$lkh-gx(I^A$M!-Yl+=wmhA-EIXrqGOQ2MvozIKW-70DpdB~ zCwz-C!M2EH&>SWIO{bS(Wa-FKjIlB(+%vGKVfQZoq`3a~l_B`HgVavyUNnyUr0z%Ok>8hFizS>?`Uzn1 
zn@xbPWavO)3zb}1Jtw+ftaj(#Bre5Kqw+8@NjHtUIN7us3`b8v?qxN2 z8cU$g9+^CtNLHcQ3zPpDY&L$R8}(F(YVdM718HPd(zg@9vJp6o;fW@2&dvzPg_|$v zc~l~kmt-h2z#_z-cYhz)OpByF=MA*jp9?r~^m^a|fJqXIFo(JMJFlkfFydgpJ1~)^ zJGbiKVy_G%oP@CGq9D9GYPQ*5rJqdKTlbU2_1xkCJYi6}iLHK}=C~ktv6)Xmc2m6t z#@s5LrR9t;7KbsjAkb01sJPgE)M?ppioq$wF&O$Bv%r=fwRypejKIA}1C0x?n3&`S zmFov=1&Na^aH&SOCnE6JugyjydfCOW=$?WkLI&G&#ef#bIL|Tj-VxlPhb*j!YIzi&Y`x zyRJ$jL*0m2dK=Yf_h@gcPy0W`Y_UncC0zFy{lM|IDnRe+0I*&2#SstNE zo}dSrGT^BsEL0kYKp{nC7&w4(DO3@MKq)QliJyQ_r94!gdNqhjuig_Np!QVNAgcYY zh75aFJA>1g6l;p;J5Ffg29xWaIU;aNg&qA$7sQ^f!19q<2$xt>+p_L6$%&Ef#nf-cVwU&n8u4Iq36 z6)~9z@LSYRT66>kJ_{07Bkz_&qwi5SkLG`X&GM`FVqLJ*WJ&A{F6oGAhNnKln1?yb zA8xs+uDL*54vPz|vDO%%i@El?yfM*4OU#a*=zzhY`l#9PD$7@RgMmTW3BcYfEWAKJ ze>duHN`H2D##2d`43$1Fttu6Ur>-Do^yE~mzTlpMA0S0e{OZ|cG}4^|8c) zmNOfuMJEZdC|q?(0wF}QEh6rAY@jy1ZH$K@v6YXOD_9`g$owOMAN2h$7e~uVp->Tp zX4A+O_=Y-w6v~uT&nkEDV~8jyd1rKZG_Jp8dSv8Zu*goTqyI{x0odAm!!&%rVfzyr zdeRc?^n}+BBnnwGASjs0fHKB?1atNqB%A2mTQIrzp2Q$~Mu!Vw6vq$)T;y~8UgMUc z1eCo*sg58MouJ*~2?%!zvkorU9A24)1U?<;c2uF`fJv@!W=mGAV=hT270AEF&C?3CYB@8Vow3<@Q4J>! zIR@my?AsY3?x+Kuz@$!uF}dAOt@B)N#nGQfGqjO7VFqEHm>I3FB<2hcF}IW(TDlfHyhS?9VDbe+!I{0C zmb_(;f*5x!8G%$EMCsR;FJHw**E=_`vz;xrL+Pw`sU8p~98M!=O&@C|kVwn(QJMLZ z??LD8ha{fPMh);S7OFdxdD2tHJOVrgnJMgKSfTZ)(JzVW5*$x({!V{A;5{&16`d`d zU`ZMJs_C&=p47BQMj>w12FQ@P0$9IdM5BWJ^h_GR0YUWcLtbK1fl*9kE&#vt>eDg! z$1L^Hjy}5b-gEW5a=fgc(IiIJTg;?A9XsrNo7ZwO35*p0Y_oXI-r0;8JMEbDA!2M| z^_8pfp6mGO%xQ<6%KweR^-RB`XvF3vh;RxCM=QG zhCBZbi5kOqASWWn$r_0;Y9ui>k&$;&TaxOMeiaVB-x$@u>WlH3fceSqbLb>5##e5? 
z!gaUvKgUU2kX8-^pUU&f?h+GxUW`4m&cMUjTm(2VJ3tz9vYYY3P(_l+UIP>})9Z{| z^#VXW^nLNcDvsc0mSPWLz$mbyYqUX`us4C=U5Y0c3A;9o>)=?jc(43$SO~Qht$cEhfkctQ0 zbaL6|+Nr{vk5ypEHd!ddgn+@W&58@5igRzW0YX1#Y?@P4D1NOdn=hgHPMQ9YOj375 z$ww7vBZ(=uK`5+u2v47op@?IF0_YgiA#0-ZkvZ$DHR_S?uGZo<13Xj#6@aS8cZVTH zgu~!_gy5p0FUk}0z4gCxCDVB|T){&p1W`zBo>*z2<_QUG&;Z*4LPQ5j>}X0Kc})Lx zKI7L%e@;-lTg&cO*oMxK#T&^fqZ#4U0)@ACxE5nY4SfMNV6)D@G0m3!iMn%g14B^U1ohhBl;n0huvx$=!Q zoey4ND>oW=L5AudIxdKipw4F$u8qg+e|-&JmN2%OsIdN#8b#8?%2$xSB(x)s2$qa} zTa~1SNeEPb24Y$yrNV4}z{r3xAeJOk9APCOe9$j~Ad?I46NZFLe2|R;EWSkwPPu!b zSV7Q5SO6EV%iM~BS8R4+$iO2cr%BdILmWm7q0yvN%E#dO!HF=)O|yr-h+o-uai{vK z4BdnVPr8&0^FNB-k!>*)856^R)#c?5(dsd*n~$cPgR&*tZ&;UGHv z7tUC^Vg+!z08eCJ$HX?y9Pl_K!n&aGTX_Y**)bHfdX5;YSIs5m%t*JYMI%AJ=5b z#1YlyTw8vx4`mJ4Ai9|o3=U&34&1L6?-5FzFMZ1&VI&p7?4vs!^?_`lY48eP2SE}m#Omj+zEx&7nn!Wd`+8k2k@afoq~qy<}eRGA`EQ~U`xSAmdWMB-;;ypV-9(M2+* zD0%VtV+NRI^DZznmqU*2_MM!s{3uK_Y}+WBL=!2ok8c^i+x!liMuFn|2Jha#l|R4* z9(6|$C&wR5EE!ok3N>V)mHnxUkZPyEQEy6MjS zG6S724D@4jt$D zLoq0PfImBe73dZu!Do`LPvU+-L|iCRoS0+vsBrEDnjv>sIT~BKK98eN61RTbSaMUA zpi7>X2pMBynb687xLc@DV$kvT#%LXQQa} zq!|QCE*vm2B8ao>V;+v?6hGV05EZyN;1IyfDKBY0mOf4U!aHbqL61;zz8a&-x zC;4drUgT*3TaBHs`b-L}D-1dGjsB`I0|vgw@fz?E&ML12CoG(ZHmZ+Phb~4u@Rdl0 z4R54c2UkLT;~;l(G3&(j$mA;r2?-KJgfY!Ydtfk5Nn#}2A#UEW{O~B>#nb z`&*+kgfOX#Dfq?I?qRhqer{uGbIH4CbZ$xH=3H|KOPf#V?Mo($#lDopLz6reW=xFB ze>4OH4$Uay8MFzwm&}|GfIn>3XUOV*-o8n`CE3}<66x&w#^ndR@Uy`vd4rLTi033s zv3zI%$u`}5j@`i3VYw2cP=4pC9?lg)XROf(Md_$82l9#)mmN`Lr(&F@L~#elfK`bu zCEc$5?Qp<+mNTP|Nw-%G#zk-P8qJ)X7HG){eR34Ik=||LClXv8@W_^+d_bGKZ%>8S z{BVY?67)<*)zpNchMEMU6Ydf5WNB`S8*4#dGZhfAPhq^96++@qr{HRCV6b*$A^stj^ndii8=Y2+Rd9WJN#&M8^rM(yYgN}XHIP2Uuo<}GQOyh%2Wh8pC|W=V8cyb06pRml%a3ZPaOt`eB&Jj?-+I`f(V7Wd|_u+AwtJ(D_A6z&dBYV zbMQ(tV47Sq{Ik)o{6|x6D+EhsDEMtP-0ypZnIgI2FVac=Ej-Jxw=4eY4iKtbri2*; zT<<%fzdsp^W>678Gp;u`F7iVQoUl<7P4E@HS7;m4UZA$CnIgF7=L5L<`F!hVQ-QY` zI1XK;_Faq!cR_adPD}J4X*DR0w2Ls|duh^iQ#IP3MtiXAp1C2ay~f0d0Vv!Vs=8=3@q5QA<2^EwGk8w74@h`bl zi@%SqJ2*xBMzQbF9gmyT1hR1DR5Prg4YQM}Gc5 
zgpM;}C4HzB4h=r?`+4vxlS|Wu*9t}N_wl*i`hUS?;9p+njpMVKc z8SrH5Td~*=IV!C&&duum`+xZFZ_?@c+0n~|-Hnd^uaCp;fuTLNW;(0kFqtx)w7~1? zL_D5!$sYA#G_c~icHQ13DS%F@{4qZD*3c7 zH;Qsx-d|wz$k86aNrY>{JGcqin6_r%IK(qz8cp8ekqHc{i1z;$tpzANjw`Kc;iY60 zB0~<$P{-0}CIiT-nxq6QQy$>wm=ZlC3aGwcdeeJ-7Zu zdXE@4I$jDS2#^Jn+VBZS^es$WSoL*GO2$aQ<}&Y?Tv-h}DP;V(VdZiF?CEA)46e}4 z=Q(Pw4*ii=sAPq;TJ)aKHZ%9C(;(15LwQ`e%>5T$_SZK?)-AzXJp<5Q=8D-@$Tz`) zcaZn@H53QmJc}+MJ5aD9L=6R&B{kiCG(UxODesLSV9YAz2y6KB{CfrYI%(?tq{73U z9GXlP{Eh`jPMt$izD`AOH1|Z@s`R1g?DDlCBxo_>LmzfNuc>befU^p~wvbr_pwB8z zr)$Q^!-_(Jccf3Chb42ww`PYwO96oS8i6$;l}O(jEirOeQ@P7AEGf*Z7O%|TIoKPN z2U01@1!`unc!cochwkzwCRMYT>3Du{>p=k-CoKxUpy7k-p2j8G@IfPO|2DlPYS>5epe{_Y5QtD|)1??o^MFjjdK@$}}ji+!_xt z@=GIdycDp`Bhv4Y%lBJ{+X!}aC+l8T3q%-@;^6yLiW<54$v<+ASzNC$8FM#(cbHwaE%4D!YF1L@I( z&xf(C7wZ`zRU(*Cr<*sepC++AX=EL4@@7WY;|f=)915;bO6kU1-)f)@@s8>M)Rc-5 zN~-*mmXloU#^S^qE81Ljg9?6f6=oGFB?CQO*>Kn-!)*`<-oCH3`LO-pq9$#E3*@Hl zZltKEZ@k1O94F-Psf0|-y45r+MszbuxT?hT)yScdPl#x>t&u;&haZe#!dvD_!zGCc zJs%+={z^7lKd;53f_#{Zk#q^E1vzSvv#iMGo5^>BX9BVw(2PhNcA? 
zsyp-rx*;n~V*%A@D1YUYmY<)$MSoCD*jQq!%57_S9yJdxo9vJPIB6o5jAdtNcf?)D zAvxVzh$7ZxKK{fmyTA_t-{IE?09QE?1&qjZ()o898~1XQkfvF>RMmlhPGlGl6kqDp zMSQ$~BvG2!bJjvvDn4LHjk^hZ}CbFAD-`sEc$$`ox%;P0rsofrh>xmdp+h68+q zCl;a(H;l~MD-DIURmDcEXe7zwahnIo9EZwZQ4sm92sGqLSRu5tKyQW>dh;#HaFMZc z?&mPPf-g~W1AL9Usv;&$`4bN^dqPme{fEG4-!SHukp%5@=aF2?!SMY7^co@CxCiA% ziGmFyxsV*-s49U71ev!wLf{JX=pSzE^G=!*IV5ehRYR6{4TYyaf3%Lv>W96A-2R7C zh)dD+;$j-S8Tu!=@%5D(UD`1}ei*u}s@k0r)fYWQHyG1A*arkaJ54E1+VC(dpicU5 zEiZu4f&pWVd~gbbYdQZkozhil#(e!Op};o`0w(gB@Xp_4`Tb}>q@Bli*kpRPy|E6@ zFCA%|PW=;sFB52$y5OiX53Mx(EuYKUunGR#HevkFm*|Q6sSD=?B?GksHO72eOgFTe zp*uh+?L7bMk{eG7^ykwIW&A%|1WRr?9(dubkjKp(S05CqbQ|pF7%R0N#wDAdPT@gw zzPfhjnurnYf|~N)YRrL|#-NNjVL{Jcd zXNKf+LaPu|SXKbhpW-*E#t#XkVkDKH%{)pGZZl^zj*@oHL^zB(-@jDK+In=Dnlmb{ zyzr0yKALJ-k?SJUaQ=br`$9q-6(g9jbChLa27lGsm00TwCYaT4 zk`o*7J#O4A6OpBa{+~5{uhSQx=^2n82M1HON7dd!w&i(Te|eMdq57iNdg3cSY;sRu zE%}xF%!O3om%sVv!8SGx3b%{lpBxjY--Kumt2H`(S6D_+4{pX{;RY#;Y*`W7RN^3O z3*G|sG*{*Po&H}0ILc6r1eh@0KlbU_=&IzU!uerOc$_K;cAXS{2Zlk2>3^`34TMvjAV;r$w8?fq%Sn({irA} znzu_Wznr3_2IYQoq2CdDK97FtvgsVQx)FP`qPVLfSx?Y{dn2}Y#Edvk#_3A&8$Utr z4hw|6tgFKia{@_(`sHi+F))cEkC zvm8%{Y~F8iEshxlWlDnq2l$yx7eMx4O`B_2lg$dVs z#!m&w2#S?K`qN}XaT^N*mo}Ba()I2*q&cxljYBroRCZIXLkxC^p5`RuL8pF`@-0hV zj@}?(;D7C6FCt~Vki2lNDqnggkZH~(6T;X6xan6z>`XoNM=^WfQ zPPnNHNomjQlmx3ovt;r8#~BWJwg?Vy#&8EkXpt{3OvqW=KY9jol*JuD|BB%zFnzddF8KWwx!698uppl6UKa`ofv{(&!$7ogVIK+^aVA2?X+VYq_<~;!s zrd}9l*gxp>@dj%|nQ%g7w_jtfm{RUpA3G)Ew`R>Cs>B5^F}+1D^&Mi~ZlM$ZL3tCF zj#+!oq>Y3)EBU^0pko4`wf9MTd!&3f3q!y}k((3CINFzKWQg*ei+`v~95phaWM& zWf0hGNtUs_95v2z&HS&ieY*Jx(r22}f1la1ilJJb6PmQW6;n=pWF%?(JS^vi9mJzj zCvH(F%a6n(@7Oz0U7DdH*bzHyJEeWQ-JiJQ z;pnLSJ79Z8!!gU|-F0}hU+^qiyZLjeGShOMD5=_$7=r{9ccp$_67eJq$pptc?}@`;Z7P_6wW6vL@UY?4h#a$4qH0`{ z)CUN*G?5(38T$ntq6AuA`wvQZ`mq=4b+U0ET`|RS1t7_+-XRy@c-ZBT@tAY}=bp;K zC~$!9+<5CTkE9sZFocdg$dSx}^I%f!E6RIARH&l1J4_fE76Q~hz<@EO?n@#KWqQlp z9saE{d__DB?)0NyPDiX8OP$0EtUhaS9xxJA1gV{dEV$!D_h4f)g%Fc$2=#GzoM*`Q 
zv91w|n2;w@)Nve!b;{5X{w?S4iissMR$kXB3KE4vzx|{wf)0BjVC{j^0ZmYLe(%o6 za1fFsvjm(De1f$*{8Nu^p_xm&U2I21tyPys})VvsC*xSl%{eT#U= zd^(i^$LVNx+%A(f^07vljCR6Vp5J!#z3EF&v!6NqEd6bJ3aK1wxl zw#{IDf51H9Mv=1x@NPd+vU(7gVtN8;84DCBH7SF+n|14t6VyVI{+$6zC-sEhoruNL z7bGVjj?$ld3Up0r{p?;nbT*tU|3?VFqFRx(=l}}y84>}bJxI#pQUWK7Wvnw&u{!LcKt z4f0BW3r9q%7$og3%P(DR&{FNyHYC-8n%D_-E@qv<#%6;a>Boz@@; z>xp>;H^~qyUNfL1n19IOhmN{C1n)F4TNw*4yM@9(I1_XX8g18J?RpL=yL zr*KUuk;IdOSlE5AF?=9NU36Z(YJQF zAptI1>nYc|Et(chp$iTU2zLem0|KSBr!ChN2_5gY_3UTF-JQ`8QQL1$E^nAr2BsQS zRT#E~ES8WGJ9I}CSXN&l9eZ|?pQ^aCwIh9|=KVEWoR*Hj!6r5fXS$TX=JPZ}0mHk7 zd>#tAD0&2L6rKd}0@wHhAelt8hE+2k-;em)I>u1Ab^rd)omXlA)}1F}Kn<|#+x(wO z(1d-%NfCN*5WqU2fGdL2yK(u)P`eOvQXhH2{i*KY3FHq3!g?%3MPl~#LbT=$OA9Wg zPVVQE=IDPYU?)B>VwX=^2cw-w;7Cg#*L$=^HpRjsW-6k zmC05qwZG#?uPqW}ov*KKG^B!L-~i9wEn0A@4ibDM+XUe~VJGvFX^<1TMi2$+WX8^d z81FbwHYSS>lg0>!#amg}`=R4z;^<&Wglz{nVJ9Hm-_(~3-}1!!^vQHX+^V>Nwk6!S z#X8;uAN~^XQv_P5T@hU>@zM~9keWl?w(wIbT2QV^A{nQ^Ttx(gc7GF#;=T2Rfe;yQ zEO;_7(@e|62!tUMtml8GS0QPJ^BeJ`hWYDc{*P5dTMCwBO|vRtG&w>tx~>(Z7%LC5 zXf!&bNewfFV;JgR!J!>`T<$dci~)X@=lTj?R+EJpn%wTa;8rkWr&LJ#Y1Lbr10Fv5 zju+`l=D0OyXn1#>3bOAq9>-o(Zhp(9ogQ@rA4KG6jGLn#%@s)yf*h&)ThTM*#&7gM zYhXFvMg$!J;r^Wr*WB>X-!sgzw#vMMQecuq5wCU#0?!XQjgY@%>BR()9K?M^UBByP8H z`UdXtEi?*g8eE>X#WtGk-p`Kj88o!?ZF7fIumXB@0?Ef)LX{q+2#oFz`*wL6f zeYieHt&Q>uM;+=q%y@mO3f5IQ_i!lfiK=gWN@l(F;p~3PuVho`R z?%gyjp#L0+C9Y_YcMtI72E!aAsKOX(5(srdr%eKc&vK_R+~Tctut;Ru_{hpZjuk6# z-X_F(z%8Me0$l1A3NDmQ9?1AqGl5juvVleXF*3tDjh0{*LR3+KhWjckcC)^(HHgfZ zqt7^!WMs`a*ts`EsTYB-72Tyr8umaccofh>^u!UiX>ttm+bop82sOv=Z$+>$C#SyP zG-R%|?zNyS1qlr~V1%eymjcCw9fCX5(;Q}VS6yi#4P0g#`!f~cE3RG)6=QW5Oml{Bm5cNTQm@KO)@EtS5 z$gGcWcjz0}V23LJYXW6Fg!8Knt`N*z^{ML&o=UvyaFD ziU7rwHS!rOsZN3bZn_emaNnNnZBo^ziPH$u%Fr<|ZKP+s zLWw~HIS#Rodz1v-MN!y;Y}BMx!UX{-f^9^D{TFRxrY`Ut3H+I${<*YqTqB0 zM69<*omO@J{?UOVWQ(o*DrU=30OjqUq61F1j~0F&4v^~1l(d& z!iz*8_LOvKs+N9_qy$NX5lBQcfKBfmq+l-8E?pId(av)MTQ{6J%;;U3wAeZVHXPk2 zAfS2&am(<8l8$9bRPn(8Enm%b)i;noTAxH~4)DpSGO*`+j`S<|;}D+v6_DWn;TJ_E 
zO!*i%M)EaZRp*zyyW^uG2AtCx@&J|$oBKqIJftSE=n)Q#br6?X!W+P1b|C`q1S|@y z2C7g{`7AJ#60aX;Eq)~4kve$L*7o0|Z}Nm`Xbcv%;R$VnO5q2xBKCo_3EI7_Vw0zM; zUoW9tTDh|XZU97sxuXoH4tKTZM-)zPJP)LG6u9d+6$cSoPk8GBM~lVUB*Ztwt!Yn^ z!Vt+w?+b262++~)wz5*g@H3b7PtmgYbiin?aG=R%H9L%plN*HryDLCaS_jt+;D*CH zT%`E|R|uU#Vslw}Sl|kwB@xYpTWTv}mx`U35qDLx;zVWX)+KJ3N16?y8kn(a3-#d) zrJ)hmxym^v=ULREN+L5On~wbBiXLm9#uo@na9TsZQ8)rjF}sGZ9~#kkD{^Iv=6T}b zU?E@zg*vs`8}2C9)o~Z@mh$7RbYW{S+ZM7NTQ!IP|LuaoWIUnZ+=L)3y(kA2sJ6JuyR*C@{1)N*Y9Az$!Xw363g49F#{=>U2 zLO!B1uRjQBWfNI6_6c^Nci<7-B0fz9cXTY!cqcZ0a)CS!t-xp6>qAzE2z%B`MGo>{ zh+JDh6r;-;90T(d^i z4++VUPqyzVOmqzH)^!_ctn_sQ87;H_-8rH-^tu+|fv!&=tU;u$0zpnM|J0 zXv{(y5l*<`+1UM8HVuU^qv*Z&LbiJm0gW$283}@Q7NFwPa7J$^Bv{>Bw}nCS81NPc zwiO&xJeX|A$`*V=WRygzT!SdTm?MvJ#3I}wK(rK5*_ewZ z1zHhi`h?{xCbwQOii}}T%vO=D`EU;2QLLTQiW{`vN-(m%z>l^HW6`~>H}yu5Rj&AbyuX*@9d(eE+!c_k>EV8i4!!89NUG@3`EH8l~nop4MDI zN(16%d4NMHiR(nH%Xmt##;r$1_ZbX6NrOj`7xGy$5)}n)Fb>-ywfQ21F+o5(oO)EI zX}I(R-dY~q#>=ACNW=;&5x1?lKT=8vtzoBOp-5qRq{j8>uC9l?^6(CU9|K!D2>vu> z1h3v{tD3a=>ML*c5Yz{YL$N|u5FXFZaaE+yK>Vj&eozOFV6!}jRIa1R0<+GV2CuMn z;C3(Enq$L(g3Fk|aSF2G1Vr2f(eL7kFT>!~=)q7c+}!rxuQGP=?VKvQkS9 z7i3g9IhLkE!aqb_zGTk1)4y15*}pio4ds)@MUM#o1Ku$Tzy+~Nb;peL{(HRH9GPdf z)}-9Yp*uqbW>yqk)gbX;k5pN~<6n7%`$Zygt#x>mmKr`>WrjEu0p}09dXBa>WdhJDfSVq~6 zO?)I2Z%qp!iU#)5vE%#5Q$O&V7OGtyj~!*ONfndZFOZrD;2kU!(D+>Vw^^)aw~d`C z)WbpFYVPz}9wZhD05VX5;~w`r%W~q3Bt8-=ViY+PL;#=OZg#G|v_j))#7>{#R-$if zh1{z#P~d}v!pFf78H#A^-!)I9+eG7M7?rBXl&{=sL;R$Lq>C96=xox6jwW|mqb)-= z@1X4trFyZGcr*YS%k9K^#vHNQw!oYacdD3$gz?S4Y4Lv^tMFsY-|XMKe;9}DnnAMv z>(3LF1^Ss(ai8)ZKYZQaW^w%EXZnX9T%~@WH1_-UBJI-bqy7&BX-Mu9X~iy=mn*@A z$_h*B;GMD8uUGw}{%b<`!I)34cdd%Dvqy9dE|Oro08759Nz`NbW?}$y%*ZP~4+Z&8`cUm!J9&pij63JSU>IW#H@S_IJ6M=%|IL>{`v3j0aPI@&cLkk92Xee&D6jsbI#~=ce;!o> zFu70@?Pc^${T>*+N9*pxqWw|TcGN4=7d{D^c|KYF4yj+FsLMk$IPZN);I}`M3khvn zztXMoIGqz+NMvZ})g@jX)x7UU*(EO&X-a^NNA-{WY;wf zonyQg%ct0P3xjHkHktFr=R9_JSN^}(v!`%%qZw*6J~ya zFo}<1WZ*q?qLmW%f4ool|5Rb;j<4XqX-h!!coJ%=(h_PrP5dBMFbazlf(xO2xpvlt 
z{;UZNesSPk=Yzn537xfF4jo1-b=F8>JLMXnZG1>+NO1Zf#$Lm?NMmV?55%fUn~d?L z`^C>DuC}}Cr2lcKr--5F;Ai@yp2GhWR!&CE_|_i6l(m99<0l^1q8Gj$v5iVL9**?S zZ!}fs9JVQMXlfYAt#@gFHfi<_#0J^}}C48SrAjmnyAh z5!qc_I)@dkm8WT#I;XE<`AIo-mdsaCRS{Q&L+iFXsk8|D8ux30xg4qVEw!nvd<<*rC?!Z0 zTv=2yIs4hxo%$#=B<$u6wa8O}NrD7u)`Mq7@X=^S#xnsGCI*)SU!q8gtYV55aYA5P zh13cdw?2uPmEzJ%YbnL3@RHEV5_b+fkcW?v84l|HJwA2XKkk@>xv`=Hv*@xBA<9Z3 z9+s~v1f^8IZ*LtmSsH&^opA&lp(M9U86sg3_PuW561QU^W)w?NZx1DdsIEY{;Kok4 zSq%uHP$DVFYD8OaiO<`FMb2lJjYD&86|=?(5nKo4QVJgQ87og_7gTV9$qn*W2T!RE z2oj4VWkf@!Sy9kG@LKd32!&KC*&6^GWfy@!^_rxNFnmN!oJ8RO5FQE*Y}Hl^YmH2D zyc}V2iUSrHZ(wt(M*YUI8#!BLhVLr?L0Nb-NA@e~pvLlK!Z01b3uq-bxmY!p&E~W6 z&S>dyTY2xNP8M*5B+->*B3p?5<#PVhHmXaHT62XZWu_`9&OX4?FCotzXQ%S<#1;yr zswP6OXaJ&?bgAz|5Fb_RORlRzE;bDEnASvB1u3w$#}y0GTac<4+qF2XOqsUz_QHvu*BkM^p57!z;d2yb;b3;N3s`@8TzAco$1**8j^H zG%i_#K)k+|s)PqWwqvCBLxiutNV9RcwTqq54=H3UP6tqM1PLE|ho*A49RMtGPRlWP zpnayp8)iZ7Na^ZfVp}WPngK7nyUNjRg}<`SFY;qCtqZAJu(I4(xZ=GxeB`rhTh%YM zIl-?lO>=^_qlfp*))BtIt^~;{h1Hd*Qbto==lW$3!5--R7oIqm$>{rfmoB+URGD(o zTLDDBjHA7V_}DA&jx3VLBlkJwD0Bge0vJxs*qNXM8?7cvSomWPNQq`uK9nopEjs6r zVMX?~T7=0aoOjFR={~s@I@bsa4Rzq=B^wyXe4N&$E~)32$k7FPvSHzU<%n^)IC4QG z`|^c^qK5Q;ffN~lFiCHm&qMsENG}apbzy@(N*07Gz3T!xlT}WG^!S~@zWF1InHP=K}qo)YUcO=LS1M^WV=f(XI*4&PO7%4~D zl+K9j+e!XoTs>^SqpTS3#^IjsNDcN5S<)?ho%1gJYIg+kg4zkfM5>CGD>PbKT?B>( zOzJrkz1(^cXf`wcbtlcbocSiI#oO9QT<8SKbdQS25_AW{Qs8n8oVKrE*nikwkp z4s^mCXO{Xl^THXfnt%CD)om<^8Cr1QPF`m?Xd%S|8ISl^9jU3p3IGf zQGB(QqmLW+<7f;I+c{m{TWgtZz!_|SJh}98fap-bYqx-%qxP~Cy1#_a zwjU2?c!B-jFYHBWm}M*!U+Oh_O40rx;E%x`d%H$;U^FJM;Om1ac88asSY^+5`Uk;D z5vwn$&h0E{C7bY4tWt>fb7=4%pku|H0qC^42EjNE@eTQ6fiYNovK9a{y`o&Mxk7@6 zAOKr}rZNsBNwR7|_}C57l`Uq=c*VJ}gc{oBExUFO)K~f3xd{idb~{|2C^{tm70Uen z^*vj9_}0ffC@2>W^cvX^($~!*Xv73rRSK#n{LGGj0I(ys_L#e_^9 z%dbu15C{sEOopSC3*fv735CXZe1M*U?GFjJu!8%zI25!_$CKxc7t8YbmmyzPI|Ca! 
z)QPnK^B?IcjAREETI6nZJo(uMlvN3$*mJSn6nI6`HB^l#f%u;LHKXeka?vKVAmES3 zh;O9y`rxn2A>(j^FPVhUqeuq)sET)su6`D^QOerY0G&KVVCa4+QKGHJdQDN3la=wR zni(+1%ZTxU2lShA%szG_vK0cY=cCs%$A>w_zzJi^S{+jg6H8tjA3rX!solPqU+y$c z_S3;iR;QL~NlMgE=4b~~^^2HKF9f_h*14KcOQ=v9ittB$kak!Xu4z9dU#^5%2#``~ z0`~eSB3Bhvf39A}N61|Xb2I>e+6SnUcepxmyE?2?n@2-{$kAh~(dbYGOJ7NV&PB~7U5%&(gS_9+)C3rd? zP;DXQ<-BGSX87YArYpl)QG^4@6;c``3{f&9s^FZ1F&G)|^6C=dcbwbnKZ*iiW&$Ol zgGCkXixr@?X?p6_zg_M(Y8nLR&|-Bd3a}Fzag}vZ#c)O#Rbl1^F*2KXE7`Lhcd!P_ zQ&-wNM`sAdze$lYCG%++eKS6r7|9LXq#s+bRs(Txv`ljwS|1*=IJxYkf({EST|m}U zmE5ePwxJmE#fv3)rS;pzOeP(W(?JIyiav1Ss~}Qaqbud$_3;?ARI++3!1XmDGYrc@ z@!X5E_yQ$|5j>H^fw%;3u*wvI@eRVE*~AnmWH!W&Od=F{%yE+*(&@z~Sr8R_D0a)+K=-!YPY*vJ>e2^3giD^mDF*z3bcW>Z55JE1XRpJ^LHAANpb8e;AN5 zbBQk;Voc3iZm}oX-?_z1@MC1-+XjxUuDm)f`nF@V2|IIjpi@o$WiYCvZm2{wyU4j- z#X|U`<9SlyK(Dc58d{t>&|l!K!E?8IVif^-$Jm+q{oRvB^tKM?%de!=tQtGi@R}11 zCNnT!*G5it>`rWvOr32d_)QJ~Cn`VQ!&kB7><}#+fH^~Kh=uRM@uU{L5M34QIl+Se zx{PtX?F#FXlstZLS^3|xnscNVF_KX*u96YBB=tU%G+UtpRQ#H9R>!$a*EEtYD1KNu z^0|n<5s5&P2EwRQvw@Z z{#?^_+w)Vqjjdbpo#rjwd7OdE1buKO{Y$@Q7v3W1S=e%rUe*Yfp(j1Q$-3Hl9>IMv zOTO_qT~o;YQMse0(ZSqXFd9oxco;)(3AfM(^jEqF-W4|1R9!8!;!?MVEIhdckAQdq zn0Lz+IjNgtjj!MHiVB_hEML(j$mE*RK)w;;f$d?#rnxv0%l+tr8E1LcIFdk)e-;ip!>r9Vr;db?zO{jYn5>DZJ1!60dlKY)?`;a2CfMo zNq69hr(PU~;ARfKN@|gai%eBl;RQBhAZ^B@gqPKiiUrEHt}wiw8QqX~h^XWdGF;v= zFaY4{4xndd+`cva)p0cg(>W#z3|%~S8m@n87VP_=W>SzR-wYNTj5zxGZ{lyr!kK-q z5?4Qz?+spi3B)&E_!NV^-v~UsTuS*sSObNSb@K-DU`&^#+AgTkY&vkgYy zSZJ%l%n)3`s{kHoW1Px1w4AbcA{}ikHrazsoa1Eagc2iLg~SOGLN+1VVgKp{X~rlx z>LWnv3e?ott~TvaJJ;0XtZ~DASzg16j5WGaVTkZL#1gzm1cvtMG4~L~E%yBhe0E@E zrbk)?(Q7q!_$;d7Og12D5Ie@k^a56RVE7!Vv7+h}R#a4nD|1aAo1a;nxA_`&vL>EP zliLBD!HOi?f-NrC0<~#NHeFzxzB}DnoGzWlsir4DP}65UT}4U`i*jD@SQsu@3(ScM zWhh#C%eo>N6k*rs+6o4SUA99OW(h7OZ!q7r#I7AZUTfECu#v&kV@O*${W)H>vJMK< zz$`cx#G!L1Y~^v-$`_uvGN@E9Z^Gi-E_`z`cvi`aHYO1VR5g`etV>y|UxSPm7K zmyz)v1QH>`?rWF!6<9FqJeoKoe&Bi&z(z?WTrqI$IJb6}J%X^+p?yK~jM%an*bp$szU5!C$A 
z@Qsfo7An_O^TKjLP?|D0y@N{ct`W_N6F!l5pE83m0L-qmVfgu)`1hFHK=7_R%b)n**=k92@D?N(>X!Nc@@eKXB8^-mK7(lQju~d zWm01)5+V7=vGe#b%H+P3Wj9$*_gWTn8(qk;ECI|8vMJ~U!G0wS>AhCcaT{Q)L;!P5 zE|laOOk_;nn--=g3@nos-s4*CG1}gCjyWFex`+5ICv^~=+w^;&4Or8!QVN|2Lhq5L z^V`@;$2cRMTaUbvS+R}fSu7sKMMSxqECJ}?;iY$ViWP-?jCO&>ARlQtBFIOTqw*zj zX#q{0yd24FB%i}fIr%L1QQzSdMMXN{kGjM{E|!aaQNV<-@bS_>)?{5aWK#~v!9SFE zVaUIJuIjV3w*$Xi`#=7F`*^pul#QBS>(a)Pe>MK;=d1gNKmFbgvsJVsG88C)kuAJ} za;W~x{XV`(F}=(y4L4R+HYytx0d3@fk)|d9MzoARpDz;5$tw+8297!sJ8-kMLF&6} z+S($3fQyT|Sl$8|@neO#ra>D6hs@ofQ2|Q-h58?L(w`|<9X3_f)dD~8-$pEV{az4@ z5o1addnUB#i$&iL2EnH2(~u%0Gs3!*GaYtY;NB5Uu_{|-BF7i2fnVUkvdgM8W}-Zz zbS2=nU0r9;u+sI#)6Z-z9Ktp|_t;8IaE7Cmqhvk}gUgp?^Ne7&sW^;~$fg{A8dCJB z?XFPW*m_@E3Jnq~!u1?Fg1x|Jp|&_$vtKU&@>YMES+D@F_j0?rJTzp)@&Ab7#z)_x z^;<=JJPrr>Yb6$(>-Agp%;<9ir8F{IW? zamWF6%7WI6pcGtnyU5{&OFEwnN-USoR`#x$)#O+OIgp7|$=GBUr^s!g;&VWaudbFF z7GfV(bY7W+`1vnk>#XZJa_f{)zc@}XOGVA?=BBlhqFn$=oVFxHdzw| zZsH}@NIBZ!6eP+*#k>7~BTMS3TGo*I_=z!A-G?1vunZT}PXd3)Fxg30G%k{OfMOT( zs>oA5cJ9vX=?~8%s47tY8lE%fVg5yZVjjV6KI`{zoyso+THnW)$BR5gnrljS4A^=( z+s?P+VQ<=>wwv`X?q&`A*VqN@gsmf9DeHKGd}gKRFt>9!(vnk&3wv(PjH#N4G3!X{ zfQK+NRgoo8AU)UhHaIo=aj`wD56b;~FWrrI!j->tT+e;>RlmZoN5A`~-;n-8s({vf z%GB+l=7-YAl@#+6&>sWMkZ?){iM$XwWty{Lgt{_Hsw}NK{8LlwHg?U;z%qvv0c-(r z{Ad!7!;brYuuUQo1T+6}1O6ze!qn-HXY}~WC{^+Ca?on*nwp+v4gq?d-%>a7+ySC7 zhDj6-Im<}Oi}5%m%$ooh$H^9wWESVL;lC1ZL|ylm&S*(q2Tkm8>&%~StH5lPFKL>k zpH-x!YYs*DTX_-BX&6S^1fyscKAyfmx@Ak<2}I6K@0hOYIyaP&G-tMCKJdpapvT0a z-G~3J*xB(zBrNZ@F0I87En-sO2)pMXN7y4u>5W2_TiJSsOVzw*dDNU9_&ko&IQ%-e z8q>5k*TTPvWj=%d>?COE%X5qVqr@vw_GP#+>hU|Mv)k5*KQ2Nx$F23y1<4;~7Eh~W z{1I&_WicT=Na0QwyxyNkNqJ|qsu8th7H31?In>!Z6w>ss5(j zc9k#FMLd^b6h1o~`1E{H_RCF5u@N<^NGU-h430ITdSZ{w&S<6bv>K8qpxz>$hckaF z-R#sGNz-hjP|e=%gMY#q7MD|aO3_}ix4{H&?ApA0))x7Qd7NcPUIbK0^PS-pyhSEiWgt5g zT8hdXKQ$-Dco|6)c+6SF%W&b(-I=zt)(CQAXb9oX*eiGQ9nSny{g?E=O+KIsIs_2d zaoHn(N0WJ%bzQA$r}cX=;XJLRmCh`x!)6D2Ui z(T(sMmSxn0K2!t6AWd;v4VdM$uvr#mS%H>?LxeRma9mOmo;{ECDUVUV>j~h6!U0@J 
zZz`K}d#xmF3`T1w%t(GU3L>o9C4w{p%{lH6b%gH7kH$6l;cd0(rI&YJD`75OWo;Jo zIJsspTE#K)MbLIaokI~D;Ub9fR}A3s3}LEtf!5+;g4p8bCYIs%ACZj?chrm5PN*5t z05G|#CV|d5t_r8M`}eXwEFN|>W6kHWOs&O7WEY1&4VZb3eF~H^MKb(kZI;TYhi*!i zWlT^Nk6y8er{Tt2ES8NJm3#p#X2Z(DZpIso0@{y!aF=C-9+<`XQBu2a^mHM93Ui>J z<77*CkS6?)xh4P9pix-h4`i2CU&=%yyK%h~^QSAnoO4*{$I0P)iQ=copYFN^Jc#BZ zBU)Aw%4N8PURZB79N41FCvkW*XAJR~gT=-9p-x}uLWc0}b1?*nf^5jA@`53s2tz<( zVwdy-V9tL}Wy=l}yZl(rv$BotzA z5}|vL#3Xn(7gv)6$~Z|PcI`>Bq82780(+BmA{y#k8j{YjN1CJ)E!w0j#xm!4H?(Er z?Lh!`U3MoSpe|N%JyYi*%`I4vSH!gYkPJUHWE>y7?`5VYo#Snr$v(}ZS<__ z(zG@&k+Iz5`RZk=nVrDAxbU)G22;v%8|f+N8MJDoVHHCldi{+gQBuetmt?QJY0pPW zT348g&1X@;s_)g)v|_Zx<4l^_MVrP*P>6f$F+}+Ev_#^ZYOcb&61>jWY^G!N^2Sad zQfR1W!s`)Qu}+J~PDEBZ`f1Q0n2NsZixbesJ%PTX=Ns4{$~zFY5_U@$M%f+Idwpq2 z*{rD7$kgzyHM7yvVQI}v%L~?Agg&u6j~q)ei@|r#`Eotth4w~$cGO{2c*!+2+Gv<2 z<7qWU?7p+rdbElsQ_88&ND@rCa#l<1d}0G43qj+~2!pHenfEh9QSWVgeu6b})#lro z(Z;AEnitJkw^^$u+8Vov=ozh@g#N?puK|ZUnG0gzIIRugkO1mzG({+mzy1SVL`VW* zks$@V#HEi4PB>L8I9(ichIHrx!bSLzD>v>uip-N&ycnHv);YhNF9VZL{CxQ($iJup z1PbbrXfd$_V^>e;f{P3;S0bdTHgSmcR-E`kafBs`+Y`SF7a=lnJbZ!@T&ba}jIIU6 z5Q-I7B1!Q?#1cx9>~(Y6lSEP}CHL&taWSOK|Mp7X@=MDrt844=#^%=c&hFm+!Qqii zrsES08ktYeG;6Zh*|Pm3e4Ss&x$JwZ&T_@dcCF9N?Ol*3-%n*SOcgw2)RZ3 zV+S3wz`{0h=(4%_Jb3cL;ElWDF% z5xGm_+U_>nBE3C1p{Z6Yl~lK*w%TU99oATDzXJ%UR<^euD8?FXg7NKZq7m(Ppc*d> zcepR4M%ste;YOK-?!6CoVwo&hkeOzfZWWE?WTxrtNb}6KqVDD(J65w67@^S_OkO^I z0YR3KR&CmKw5uDsbnDTpPrm_!h721qYRtF^lfKsBjtrCBcf&)s-F4F~_dMv^_X25e z=$Qsi@sCn2K|umBS_K8ow4$eMiEu-lJP3NtltOKR5UMOH6EIq#GBFw~su21z6^mR@ zaUZFWuqS-lb1D3l!qL^jG3?1eTca^IuMT`y--F549ZWsoMu7cB@MZ{GE<&1xUt2bb zm8ENhT&<2;oN$(+Rt`F`HB%?Gb1jS=x{*^1jFSSK4Z6|vqJe|d!KigKgw8HIjAsB6 zs9ibkG~(tQ%)CV7j+`|=PQV^hj*QD*j15-|jq~qCQxYlD=S)vSPCuVJwR12ta88-K zXpUy-jUGQ~;r6VVMeC;uKSG|GbgJ{uKN0OhUE=^_3uy5EeV(zp$}kLT4h42!Bg%CM zD~4@{9bCxH4`iwGsu5G6w7v<)syn2mP+<2IA3q8}|EddH{Ql>YL29h@1%mA|y$1jYNkCjU&>a#`(3n#B%&@Uzr`47nlDjB~ zuB3VUUW^9KfJ5NK?>_Kj;H&EQjC0LL?@2_$7gCWr)eRcjnaFrgNptc|B)eCjK9h?b 
z92`04+$FqJ7k3G7(ary&d*dCYsZ?$RUOHfmOy_rGe%uSQG6Fl0)%Vgv!ZGN;gP z4`DT)T~j2XR>&p&4f+skPHa9GGxFish|I5IMm`lA5v+VJM?^tH8xu>HL%*<;L#scJ zWAd|zr%&3)LVuKxPP7~-fbZVND9TBIZ}lT*TY(w4xY*c>zSW?gjgQVwgogY&K01_$ z&5)o6IO3-E+D2(_cIYCv8zf>IQ;ej>%!-_i8NZvq0bum|_tD)Jd}Z-0ixWR82b$}f z+3i(I43nz<8x^l$QD}za1-DgO4qbKB!?X%XIQ*~ zZ{WlgswadUw;lK^vAop>o`7`57l_<11`)zje_!LQ9rtaFO1Q04h`Z&p`If0?{4u3F z(v5>*oS4Yj1*w>TZT<`};H*!laO^GB?|$rC4WyHn1Ai5u&j4p#|FWyeoc3Q04Q|G( nJ}K8CqCNR@09X0RR9100000000000000000000 z0000QfgT&2SR9T-24Db+E(n1D37iZO2nvMJ41}~&0X7081CIa;he7}ZAU|zobYUO` ziXsP?aSVYyTNht(2j}MGo(f*+%_=XyGiOd61#;7Axm$H1P8cfS&JP<00DSV!0##L2 z)wHBBj<$~7Re~VzQ}PJT#Uly@$)mz6$>PvbTcAQ~+hRXzsOhlvd3G+A9ZH|63QuR& zl@x9YaxM=^BeqJSz)NFA^tMx_r8g4YPI=>vj;=KYP%H5aYRD|)s z9ZIYfetK}rBw@%jp!`@hky}r(ryDD=%5*CoqAug%*bN>g@ssk@nGX9!L)aigPwEQz zJc0{qO15XZ!OqUk?-UC(jU(XGJYR;t4f!n)m&k}%OQ0?z=&ugpz~fXDc>F_#e)u3k zpP|i0O6?H&38CBE7OZJ*mhG=uhPBMALn1*ktOD$o_z-Dyj6p)Ze!07!{wsr$_up!w#L@6kliu#ILT{YS}?z;G2 z@A`IbCnGdsDVVn=jadFDf?6ZiD;Cs(SP)C`wW2{Ri|XK6$mJ1N!1Hwe*J;8f z!B>Z!@v2*$aFpZlu1SA|(c{b+Uo>|l)ti-PylCw~g>)sCb8_S$!euNp8K`cfb<`9brN_V$RN6Kd0zi2wrnx!#>#!jBTZk7#UVjT61z%zDer zr$i~$0>EH3u%OVo)@x8^n-dj2;_>e0do~U0Xw_7A?^TPd+DmQ)pa6a>V;D0T8=w%K z>;+lts%ZE3rfThb3sM9eiHk5V;WF&%F1I-rRVsSm?JkzsWdsPA02Ihw%C{ZfsEPxl;B$-TVs;h*-VnVO$rPO=!&-mQh>76K(3W# z;RFjKDqL;ZuvrK}HFk;wBc{IZeg5Wu|8H01?bGQFn;;@0!3L2uZt0;Gbo?e0>q zR=~>3mcN7|4Bou^*E@;950OD+vKq05wHUJ4hS9QVS>9O7+6{r9q4B!bwVlcX_>!6#uhaF+*z8>fy0^A9uBGcfU zos%l<9#Gu4gf;Y5$&ISNV=Ojtvm! z`of&T+(%!-UU;i8ukhh@dy2t@;U)j!z!&ivJ)Cc8lB8?OmUUu37cSDc)xBYClXhl% z;_4TQr~Ma;;ZQZHUy_gNS5(veSBaaUcDwP1G%(9^?n!BrYIn>!tAv_KJIGJA%wN=? 
z$wSI<<|Fd;3F0#=6vEnICxDDG#!dm3@N?5Fp(S3hSyIZf#Y;)(;vAYdX zf2KTpGQ=jiZevAK;pYBSM~hb(Y$Q^lQ~MX%Y<{4hVy^Z7UI z$0v}z=#(7H^9k?-w;DXD)d~dE6zlQ7sZAK%si6*NtTj9R4d+cJnS`;QgM{;h7etZI z7Zmr0Y*Z*jA*5BQIMo?E5gtG&u{@|GNsy;g?3!iGV;S+=f1-xNoZ(Yf$fK=uy@!83 z(SX8K(I^J5vmrvr8P>e&V8~|KDr1D7>k}BS+QtwQze3br$^TcDlJ*j{q zRo&fRw%|KgWMgLd7iuowy`Aq}?q=KEecIO*++;?K@JH0p+Xy3#m&Fcta+oLN^A+Fn zqrz08N(>%oi3c^vZS>A|>y2SN+>qtsiHmP1915F9Dlt#0)Y=R8L2vT8vDis2_PaRj z;=GG|7vH$}F>8oe*OnRpdUG(x@t^YbIKCX6rgwbfe0}C`@=yF@!D&ua*%?yS?;XL8 zQCm#9cL0t>-)3lR6GeqB{tmpTL7CU^O~EghPVf-C9WA{O=InD@8ldLIY4$CWj^rI=5X>(~g?lYtbqI12*h zn6X!S9|r&^FTMZIklDM#P}iu_to*U5`KR%~HKMvL*kwTi zgXTcdPh4^6ru7f$ILkcm5#R!whZae@lij4SmsZw4(VscI1}p_jrfO=~>|T1-N1A%v zNUv!V*Tq7y)mHzEhj2pyE^Om~m)>&ZqFjdc*>3*I&*`sdW$j3+vHw!nRrELXFZB@q zd$YmBC?LAExh@EF;EZF2^%WQZ%;1p$T$iz4z1xnmUaj0K98f^=Zp?q_{=)v4*Ud+;Sjg-_W*iXUBjB2s9P<1iQuS!o2Zh z%wyb`$9xvBkVPzJ2}@aqF?VoRn8FpINJS}H#T291iZkXI?kZ8qN~v^ZDqFeASD}i* zhVE%HrsBj+%G8<$(`?#Ir|B{MW>7UR?boUE5l{cBaC=p_UcN*8-2r_649Ei6Kz5J= zqpv7eQZez8Q&MVdoop7fnZw+|r`_a1d4Xz$DtQ5hptu^>s@65>0Ya+N>FPpFw_b0s zUtEE%p5t7DzYNA;?kr2Ss{IC-nRL*3PVW`&@uSai1)dD3eL9Aeg|r@Fo~)1olZlsW zdAYPo6|fNf2j;;>cYXcBU)+I3V_D>*QL>;k<`uc+vr}6`&v~o+zxG#BDat6T95b7{ zMjJot2~T;Je~uRl!6;mR$&DgZ3ZqDsW)!6|jG|SRQ8ATc6r=KtVynQYcoi9is4Am2 zHGLxFvy}8Lyn%64GA*HsB;t)(tbgu}Y{No$8rQLY(M@EtnJsKhD9;O7y&TOkluD|o zs+u_Q5+q8JJag;DI_RjA&UsP#!i=>qcuSO@rV!OEDp0rkiZ5sgT8u@$+Sih$KGn4* zuDrIZsfFaKdGD`Xg3+JIJY|?xg?9V+Zis8_&TXblE3kuEWV(Rn{pqWhxN3X%!rP#z zO)qAMUFvkjaHq;8=3ird|Na%&>a-{ZRs(!7F$ZF>&MaDufmn!)HhcKL*JWhlz?&*k zWfY(aC0DB^wQN8cE!QgDHxVXzy3R#&)4XeeWk`r{Bt>lU4e~ScCSlvVLsa=g^0(!; z|C--X_!$%qE9?st`V_kF za1}f*cu}yoU`s(36f7;6Q!wefPT!^At@oSXA2o+-+D<&N3^<^;2{rYm!}OYAGh)^Q zm|SxOfHU7QkInOg%}Ysn%A4~|q4~sWk7B+s-%LiLP2XuDfW(!*?hBreTeYoAlj_}a z9nkb9H>kn&YPCAlulDt+W1ZTlb=shAO(VSUCXi4{D5<= zHXOJ4sST@d!`rS8diuQ2u^*0eRrzRl9go!il2=avIJas0tQvdbY4dCFfyny;a4 zRlB;jR4cVitF%Bf^jMinYFyjYznOZgIohn&wdtAmDWe{}Q>&62UyGX8vhJK1?0De7 
z9Z&p-CX5Ioi9a%95ke!)HPKXS&00$Eb;sRxTNEta)beRfrNphD2w@Gw|I9I7Gk5mt-?lCnzrMBnP@{BrO zP~U4BdP@UuXz4Rr_>?xjq@(ZX><2pep00kPhhORL7rObGzW$<*KN+5lA&CM0V^|i0 zgP|El=VD^sOfsziG74gPf!NqUPruR6-+HAddaZoD(Q{qZHQm%*<>|Jr=(6tUx^C&d z9_kV0Tw-kQjLONF+=z9C$@wxRKaFW@BO2A{Mz&Ck)U(B!r4u@&BRZy2I<1pBtfM-v zJzAq|ZDNLM^&q?}t9&r9RmU@c_6;kj9A)ui8G~>4{g4P_@l$#9jZWAn!KYpzT5l0-z3XLH$ zmOB7+s>b1K$8wBKa^941+7osdiE24dh?9fmope{j4a&MZ1=r`A%u*n7yjT(l*N~&9 zF4PRT7=w(NW?;~})r3LaF;u;xny1>hCR^>+0#2Zq>kdRz>@q0^rL275@pwvAHMJhH z8Syc4O7p2n#ya%zK&~AHufYV69(y83>?7)ww`R~~*c)_!!TJzt8rGeH(4H%it}lo; zY8kAqzOW0Uo5#KBorLRmZHJG>fLw8O*jmKx!g27%T;DhlM57(5j1%TXj*jcdm@%A_ z^@LfaJE2fB)@X(t(~7GlF_UE@S<4gSQ_bgTgt$kDQ^B@%nK&S82i}NoLf^8zf!iD`#7{W8g;ql-PI?d90<$o1oiFx!Whc z$eW|^im_XIF@SW#q2E5fRg1dn(w2f-AmyEwwl0k5nq$%lMGIilmFkZ5LPVkt60^yr z8iUE0E`MB+Utd>^KDR?
  • Dp`DfF>gttx}wqcA~OQbFbEC6YJMC1{cI)u<;)G}`M z6!6-KXG(V%Gp;+vbds?{yp<#bNl3$rbCv+%@`ujaybVOxu5W~*%84o}sG)*7DrlgBCMsy5f>WrVjS5y!!73_R-J*3|Xf%(M#u#c} z8P=_eW7d6lp{8YHz(&8Cr!BCaK%=Ufmb8aYoKhAmKufr)C9`Xum+yGGUjitz=HIxq z%1kvSL94YbK9*qGtgTQdYpT~ubWD=)H6Z>ygRs@Fx>SCtmnrkv(y?f`40thFy-6|E#L~62B&#ptt&Lo+ zh^H)T#8J`EYD>tmR!%%%CMw}RN=qBgbPy*7)CiS_gqjOBabV~USg2Opc&BL5#Ry&L z8b~@dX2lW>Uns%itqITg2CbTLSIm`RUO7WptEO|ZrZjtklX~eBZ+#>uJ6{ozFyqPgEc*k=O;;`F z2U!`AuXQtbe`!Y6YuKEGH5Z6}T4pssZDQPnDxlZAiWZG7sR>@UCBm5Q*HpXqKg7B> zQ`sn-9lzBjQT@hkg1N9U9r^?7I5tD;?lyW7OC?nWZk1))zq5mmUYZ{3JN;K=d(jBod_sLMBl@2|^)JSs+x>uvU$;xoJk!CWMZo-obeoI2see#O2vMoFz;1 zK&t{_E#(brB5Wnv6$pFj>9D8388w-Y;0c*UPhf}V%5O#B`j;CANK02kxf#;^3hb_? za7@&in4~rHWYl_NO6N^eO*G9!)32!YMuyhH46TJst%aE#M6*mZ+ccPia&64j+L))c zF<)zAK?l)76D=~`FRu8W#|-x;IwDTkjMeZujV;7pzhqt=a}9XMDtXy|(7vk?`%n`; zkW~2BtXxnQ#J&Y3FI?gE8@M{)+44Tlr<26^yScfr_y-1 zwRdO1kDvZ_qHvE7fi}`GnDE{A?^EjzDI%>uo zmqqfbx6!{pX`@$HtJTNcX`W5UsWlcZy;8*DUYo|?q}3&Cd+hOh%DNp?JO{%yJjC4e zzTAMr;-)+bzo3&$Ow7XLA$09MVn6^h4nPD=9WITAiwCskNa)?LbF|+?-^`lnbv+_x zd|2oQH*Lv6sGCaRj@;Sb`H2zS<<6HaCy#^XqfkWD}roK)zWy422T8F3o{mv zf6W9>OiVTJ#mxqJ6cPZ0RG^g<9ni)Mw@Vs~E21NYFikB3xTxffSR9!rxNqJ}ssYJ7 z8p379)aq&)Rm#~bDcu!Vp1~WYHotC=MKF!QxTaCwf~|!Ch8)G>+QST3XEALXt(Lw? 
za~|(@&dqztnRGLj_A*amTGAD3Tfb(Q!4hmuyc$CU-&jN-D${lV^p(SDU_!A&l&gJs z31xEET4{~urMnkv<~=C2Sky9Y1li7581=fnoL0nsKCoE=*oIqoflgh1Cpoa+0?#Q6 zPD&T98a?_fwP)PZp<#gHUR*2MA;ghbI2}dPhNg07u~9$5qa4-)rcl~rc=9Q|5|mWY zaVfaU|K><^x>W|#B7<;TV_na- zg*=C@{aWy)XyCGF37XIk^@lK7_A+X-kZCGykTJ*ZXgSG9Y-*6&no!DUq}BF>j|R^C zuC0bt(4w+f0nIcHg)2;_{wh!k=(S0=Sr7}V8w}aEg=YhPbkUW%<~5z77<9!^yfe$8 z*SSnX=E>l=v*O|w*2f5~wRXL_66?=psDf-1ch?+V&kN=xFsTF0kwBS#g*ei6O|^K{ zf8ioNy)FYg-JUjpAQG%D!efOE2hCdbwk)A!LRMbbSOYJgG-77Vf~m2!WkG`-3wa{4 zco7@2`CTZkIkga7O0lX3l~MyN=ks@fT0@QqYayiEmm#c%+aNP%u#DI(D6+46S(Z9g zJJjZBoIhQ5859HpV#iik8`a~TpA8x@cwMGtnV5yRb)k$yqiA=VAx<-{@|qepJM|e& zL?D;bqJy@Y*otN(E9{xXn5ySzG^SdJ^Q{u+UhpDw=cHK?90exKqC=flszNka;nK(R z0t*HZc+B2mpQn~j>}@aa943uu*qSz?z=yjhq*;c%t^4=kVsXbqikV!D?H8mMhvWdB zLseULrcjHE48%M<0S*3tP7gi1!S}M!Kj3;yTj0|qNNBmwlTO`!4Kb>3k{oD(*mzPu zNrz)4brM;U8A({|B;~Q$Dp1I_%9Eos&OQ@oB&~lEK$*^A4geC}*x@X@L5(!5R?GIm zuz@PWD1R15tv&+Id8JLx1#OV>AhW`gx^(gaXLBDS%L#o$sa5dsvh26)IL0T+t^jKW zSXqHaYd`p^IMB&<4>njyvHa;ZtSy~Hv>3agR#;=zS#@AgVd1xLIOu8H((D{$Y+X$| zAZ$(uw{#^LL(AZv-Iz_8+YUPB?b`Y{C>XKYD|=D*+9s&0Zcgs~E}q=@;HyI;L-CGV zT^-9Ur0qMa99yJzxD~94mp*D6dP*H>7BgVnp(ld_A3)p|xp1d!iTBd&CQY2s^jb@@ zHw=x`7=?cp%Q*bvbc0r-tIJyOe2Rm6t55%B6D;vy7CT^a0o<0@&lvp_|N0Wrh;AWA zyEGx@HAVF~$(xU$(gqnTAOXh8)=g;BVl9g!^;E%;J+qJrH!%M4?=5WETc>4^_>|$rnySw;&5~*$3x6iVKg-ObkvNVNgA~|<@tlSH6 z@m#+_JEfglLp{6@vJ^|VjmpY7NVU?Z%rt*VZ>6=O)?{tJx3FqkiKiSa` z#-vR^rY3j`liRi8t(40>kauZ`sR(L1Rf_Lo5e zB3K`UM2m-4K?(vz;RIniI}5R`f{My#Va;a!YCsj4Qo_Ga-A z*j)Y?t!Me`0R-O!zq_68lc z>jg<=;5HY*f+V=p565=bn?+3}K61$eAV?~FxvSn+?!lfxlvw^${l*<)PesB2wv>vF zU(A$qx#Rf*Z&Fq2Ao*BF2RFSGhLT4fEF3D?9qgRy*dloPt|(|Y_-T+L>Db4@ux8pdnV$#3H2xKLdfF^DXXcFMJ5~LZdc>4IV zYJY49n|8Zl?F)B&!!CE0nZm|1*+WDdQRA~EwOu*6s=Vz-;M|J67f4PS)}qW_1R~vjBK8c* z$GcQ>Nx!`#GzBq`bG5_-rjt@CGXD!K%NNc0nG5FgZ&rC93ztLyk$vQ=!U|m0&C&~J z2e1h7S4Q`L(g;I?7)B3Am}o>m_1Holf=f*hEkfAU)K1)@!9=TJ85A)`%6@kAD4Cub z%n-Q`=rArk+GDTG#D&r@ex>7=R#j!&fy4Fs9(0Ep={*HUoh(7;QbuCG90heLT`D{+ 
z+~p6&1iqGLy2LVB-auWE2zx#PsacQ_W7u^8KPrf6f_B-jWu8%2I%*?BanKk5KgA@R z8=kLM9?cz1A0yira#AUYG>Do1nFv_Ufjo?P#h8z%{hyvWFoCq#i5&#Bt)DZg=ol{| zdBEKH_xgPt_ba>Tl z7W08Cr%8Fre~9?P8_?hQJ8;Ko5ZBuiMM*3T!a}pNwwsqac~j?%;WNO#G&$K`WCof#Qec`jZ9KXGh7*4n} zU=NhT3>LagJVesBdXGwEqVL4DTzg#Hy_$Z6I?}<#3_rf|PeCF8=Ot)_g#+P%0U$_+ zFB(t!@Rpv{SSedfzD&7Rkg?pB7Cb>V{Mf!dkrpd_XB{^Y$HuYzXZ$G3!)YfRap3%9 zf-X&q?T%5k`1U?~sZ8g$=_B8v3UXUXavk3LlrGz6`n2~(77r(taS=+?-&%n{All#R#vNL>&~WVdjV+n01F#P7P}yR(>H?45TI{6lo$c zuoq@Yx|J5n?ZuY_E@o3@5+-U#=feidePdC?TX%@X{NRs{SAnC(>u^NH_KrYbzR*aF<7xR|S^4cMCafyPB+WKOPlUK!kuOQ6l>O zN2u!cpqc=v4FFzW*D@dRNM@xr9^6q`#YfJXM-8K)I>|Ot&*XC_2+BC$E$7U9X6nQI zm66xO%XyWItr%Bl7FhUtjij^%d*x4#YgMg3qNJD-fsboQ3a;nY&g-owEzytgz5V3u zsRB}7#nyDZv2!;3AdtKvr6o1cAZeNF$qMx6CybVKDL`3Vxr@4QO(YM|Xd zus#<1(0KOAH0A96Gb{KaL;!WNHB87`0@3wQu4_ajrQT#j6Uo#P7~bYr!iWwlc1AF4 zhYj$xSRO5Q>|>-+koFfNveC%(yFdb)DEss%i$N7bAL}3;rj1^enh+^);P&hU9pE&M ze>ScdfHt1DH5qM%IlVos&BL{=iavyCD8pn@-kPW7>G|;0cRZixKOnTX#&l`Ow?v60 zZpE6QzBW58I#TcWvEIM9DRZB|j-3%dD;6Q5ai_0oQfVA!W18TIXEWwV`RDk)ISU*; zYfHC0nR3ZfT!>+o53!dm@532+VdL5=#v3dI+pVQ<)f}O`QTc~7#PQsxeK0)|wXS&u zB4x4lPMs?YkM-t+P`YsGH_yb6p*#cWsdj(D-_NWRKdpM2% z95?^>&*rnQBkPWx>>+CMl&G7S*5aMh{1^$Co_Eb1c8%AEd7g?nII@113tOKM*P8z1B!+-C60AwI=I!62^gr&jznF z5S7*U*D3$#2t!sxgGuAdWZhq$j)YJh3u%TlL-uZxbD67SV0pBRcjMNj4W5c{n(KXQ zmXsJ*r;Lx5Aiu;py<+F}6ys#z#Cp!rS=zogxt&jO6RVh~`AoOw1Z+Lg)IZrW^9-$^ z(3uzd5net;%v~s9VXZH+W91*UsL4leND(a+hZ3HbsAmW`h04 zC9l89jJLFXacsJu@ka4m6%-Q)J%ctTmW1I}pRs|O(D_?Yp?B~}-eZGxXW37@Fiz+T zmhBjBtXxuW%p0d0pWbJeX^Jb`HtetkrFc6#mUnr(j{y^cDdvd*z>BK---RceKvM=_16!Z!LJ3y7U;o?g^)x3mVBv8!R!`&`lP&m-< z9Bek$iU&E|+WkXw*Ur5WMF+i-$Jcd>u50|HPx0dKA=k2z^9rVk4*zB^f3yu1!g_FA zth|>&Jzi1yV+eZZe!iOiZvCXtE!DA#B~kxoDq_R*vLEatVt(q^K)FX4h1k*iQ|@EK zz`@J?kdl}$_2;l>bnGUYZ5ZNXDvZD9IS0=eD=-bssPkW|A>tx_(*#m9TC0(zy}2opC@Cat_DQ=j9xN+ z$q>CIaPp{eNwbgq4is;zs%x1*0F|hQ6rq){f2qhOtGejgKc4D>aa4WS?RtaDhT#T z!K^?DH-n1N!h)5fw$l^?& z{vuXvf&!ZU*ut^_>*LH~tHHAHgqTDlmXIP+hVZl{Fi*SEY4A;iXWNM#jx|&&gjuCE zqsAKiTrFA#yMg;!@1UcGka)mOec 
zA@l0)uQsptSFe56j8W3SN={GRelxjjQ%syfRg;-bv=I4wGR=n66hpeHl*2|sRsx#g zLx~aFn!snbvo348g-_HRJwZqrz!MImDF)mWN~}Pr#il!@5fHU(70cicq|s-V-c0m!@anrw~Se5gCV5F`w5;;7pSqL5q(mv-qjw483gT z=F@68!U!43O|`#U6c3#|Dmal}ov*ikjo0?HtUMf`Kyr;1>-hMmP@AX2)0hge2t z=`u-{u~Wm3YBpkC#48oBEVFupVvsE^#}R0*SSG`b%ZUxwkrlj=1lbI?6`$Q!cF}FD z)1&1zc}#{0g%+0_{yMSzRV2jm9ARf9*(nlk`U;3?KKXR=->2BkNa3(#^+LhJ%x>9z zQ{CYzV|lI=pqp?GMBND5W67c0KmEc9RaLx$OlXY};^BZa>uMoN_VQukNoLSqHSI+l zhY1c?pqdV##(Q`FB0aErykNK1;TLr5c6A-Zo$Dml)WLujfz=FrVbnH~SS}bj&ZoPP zg^>(sr2+&a%bdEAeP48?&|7$1&5SVNrdGPO8_bd1CjOLq!^pN3i$+uBi3}&1PanLRyKZ*meX_3t`1dX;p0}>#`x6 zvL#o@m15}+k(2-b-=lH8fBlfYlKWrp-`UpwzV@{F{U?dO4^Hgfr)zuHkJ}e?_Phtk z-7++Z0So}n=;SQ~Fl(9qxXP;TxwnIuJ*)KKfd`)O7&)1@DDFIQp5vF#vc z`50r2RUKnGj;Rrk+MCFY8K#i77`{vd`X108^oI`+&3OFx0C8zm|%n^&+>sPUh2pHMlGVG_)Or z;0eJU?@h`%Kj$uI#0@=C(oR5FWr#}hWZajvZ8*34TEA-Rb7fc+eTFzrBgtoVGV58vyt0?6Jt6SW>xahu*#= zCXA~>W~C5h9>TvG=Rfwi?7ELMfB)V~uVm3E1M&x(Zgf;kwB9PR}>Vy|t_z<7fszG$O$_nuo* z*S)4ydMfK&4yG?y*mSrC5Z}yI)N8I&t=Lm1^I6xX4{J(A7C!|bKxlsSmAKmNho^%7 z^Z)dE+MAkQ4{gB7GJiS`%3U7p!9yd=8!sBdR(pHiB#_$!2J=Y1srF z)Exu*0sFh2^bI`y0)gR{y;n4+J&=FA-nlOCPWd# zvC~4=wTCWM`YA$Xj6ubFhlHAB&Ak9W34dn*8$x!R5!wlOqg&Q)XTF?i>6?XZCtSCF zy|cW;b$czS^pK=W6fg;GSMX!q^a1dj`Pp7KcxM&Dtxu{ecy&Ex`$_DVLIv*b7l9NA z4;3iD2V_Q@tP5zR^k*3e9kBusJ+KDJ#KbcODrqX_p>9-tXNA2%U|-;5b4#;nUXv=l zOeISw$feQh179Lz^94ZtiLWs;p@OPXihFx*Cxy4XrDv6s*~hG+MKJV(?E`(+^`F}> zmv#pSQOn2$Y!D0r3t$Fq_+S9n4vwwhL+9_tif$nEf~5^Nks-WBRqE5rq6(N8`tAcH zNc7TXUGoo89a#rVkc3kg1cW3Z@Bv(aD&Cs9u~h0cph$B=lgv{km2-lZ>x3}iZR&NN zG4&}PH&pyM&18i;?y*P~KLSJm{CqN2nBtiD%FtlghQJDllNam?C)!S2LC=~DhA!Ly zkH|jYdN<_h2J5M+hSlAm?%oD#CK_H@KM`v`p4#y8!0JRURqq}c;YuV{3;8zp0BDJ;(TRl)}tJ_Y1*#svHX6gy68cg zwt=?jR=W0chm2nCpQ%GtYxIO=&R0C+a^|k}`!~bf>=U}}LUZ5EciV)+*bm&_Y-P+i zzKSY+*oj9HX0d6z4{XIOwOTLvh3C_<-6*e55$5=il+@HOHGm)mO8!HspV{C^h*#2% ze|2d$=t)>2KmiQ!4m~$W`6936+-ta1)yYY6QqnecDxiys5A`5r5ML2Vv3m)N(^xM; zK!U6&bs{GuxB~!$zzt+Q?KMP1AB4Xrj1f9>!h15OF0+HBeELi{S+KCtx(?uUH79X7 
z!v>X#ZJN6=oGN-OUyefIr1+dVt*e9kd?JPqCC`ouyomy+(|$&__3(bYW$1h+eMG{2 zfLi^!^MG=Cd6mHRt@UL@YeGEOx*>5mp0;^9WJua2mLMyeS>HXrTz!>00cxRZX5gWw zhYuc+?eWSMxD4%ut#BT@Guz-a^(S_KZ5}qZ!@7(ss{w8i^-ub_863FkZNO<>-gQm3 z%WF=hUt>_U(G;k)XbL9xY1#@d)n*leKd!n~>CWJ@pB6#w2Mp4UkmwaF0t!^qNIO3^ z{Cd~vDI!>9EDPX-cFcn(<(UOQ0iuix)B&`lhIor%&?R*F6+8{Zczhvt?IZ#K9*%7Iq^UNaVF#k=7*BfHYdJgtua?? zzT0|zo6tM;i^lIGgg}J-MjApInbLte$KbET8l*mFPb5-9j3EB;Joc4{z9g^Evpr zlKZ(kSonGD&)>oP!qha08pf}bKCknOcaX2q@I4Iti_%ZXR_V&$Q&K!@GEV&Xk==NM z)%b!BYjYA;^)@&tuO7#<=1gi$9jH3}1?PL5&+88bZv7S4S-XNg#pQecAJiNFf#bnH z!+QYutGh8zc<@|ku3ou(>EZ>?(>U~9+tgKAcrRuzev;VP!DO9qh7 z@((lDd}bw^vMmvjK+n~u*VybjMU#Bd{p^Bk^M)>;0V)g#o3+vUPgG7>6?zxZcaJY)=<>z!N!-%bo&TjirgJtBKiSMHFXP*3ta(BO1${8l&v%LMzTDAV_%q;IB_KTtDZ zm)Gw8ew789dTH1Qbyfd|r?D5>!rR|eZ-H(5n(9tF{WxQjZJMFLt)F5lt0+*HZ}b0P zy~}L@3Bx7MJ)M9b&pe5}<2~w^T zUy)MWz=#*@zf3Ed)DO&m>!=nQhe_6XhrkGNT90M1!h#K45L%|r3y z4RpWCR+hQc!W7VA%c%f>H@MT>GP)mwcjWhGvqGNcUVxUsxf`J#?0J&Is0O}Sh)CR$ ztZ75kxf6))7Nv7RN=bg{4&h~^fh zk~lBfbvJ>Q8*oB=Zqt&N-&iKlNig39!?O{ujgHNSkYkAXFnSlO1JgX=TmY7Jg+qnf zhFYkCAzdtVTnD|}Ybash1{md0IVhP>PNsPgbC7`rXi;q-xze#Wn>i;W4E@MIjm539E%XqGNNy)02nf)-hiRv?z(mL6?xAlICf6=1QX6v z@{L_DwD!%)8{HYVQ@0sLt!(S3PwnrKX10O0;9z~QQXWm0&J*q{JsGDu0Dij%UhW4` zF@i53N+Y8w0i8w}on#4&utt=Xv?yBWpc{MXYuEw$275NI3)QKq`P`|>soL?%-yrgD zgB~g6Tb5IRQwf1zgzL&}ys-0^?MBdetV-!Y#^|h?hSD2V= zW)ua>n9~USBbuM|%3zg0e6!q#f)xBXh(c;}Bq9Wi;hsT4*wEIPbT+Vtj zG6GwnuezQaXkBQiY2cYW<2dD!)gW)+$hQGgJ-x$vdd;fz3K@;ODPX0qh61X9)HfSS z#^5BKkP;TCK{fR;;vkZX_YTU4hES^pFig(CG>k$upxE4FE1UMN!t0AyIu=QEkE62A zDb%VUed(B9U?!vn3-sWI{%DW}MV5jy32I99JwPpDAhfzg*vmc$AMEu*2yb`dSq8Ew=m=Qq?VjNlMQ9u{fvh3lqXAtR%?s5!Q{PlQ&(!8`Si7wl zCg(CYrHx`s zhZiqG6FACc4VwJ-2@y=p>A1*D%QRRUz_o6>18s{t+XQsK@`Yj|YVS)bt4zsCceCO-89J<_p}N{M9| zXYUB0K20n4T)WcesLz|ARoH+1^PM!e;r`$&S|%16Py|vKn8xxtzu?jHGCCZy z5_UwI>6RPo$uf+FiomImNm`5U+@)4=yyWCAXMoeEVg;<3x7;N~6bFZFls1<>Y5`~} z;>cf3)6kmkJ+hN4H|3*7^jM>)xdkmkg}@CoCvp=x-l!t_P(w znz|r|Rdxk*;H^%2<*jI8g6rx}4Gva$K8d8XO1gA{V340Z3KCZ;0FT-PyV9XQkL|B| 
zcmBck0lv&;Z<666@`cw%5*{cYW)>d7Jo*egS$X0&6okp{@U@;X;f#UZFF4KTiD#6m zZb|k;-yGrG{BEu?&##h7Kh0Kl^8xnHek}N+oBfX87Jy&*{Hc^1x_#L`MfG__-`t-c zq^5*uM=6dQTJhArM#n*PFB5A?O-HQ2srIM)Ee+Jz)NF^ZVfD5DT=E6JR`Jg1iJu;< zfbaH4u{U7vM=AT8==;9^`TIC@&tn@MjGE!2AQEQ16=Q*o7QUn?iyq+N*L3xXYM!WR zPFs=ePt|j?^FgZ?#01(!0Q#G4#xEr7r7MUOVqf9YffG1&a;%&S@@-n?@D1g4Qt5mv z2g+IXDh%W_QW~(aQkMaG>?lKi<++HBXB3`v5BoK(W8f!?02s~o0*oHwfn(l<@*=%n-~$zd?Rj0T02<``e)Wz)SP6^idn1nCEcZZzM(cs7!ZN9Ex zVcK_AJ}m35BAJ+Oez-M_(5>p532L5Hx5zSY%n&r`+lu>%@t8DNIj!$6AHz#~mg2cKf1I0}V`7cP&8Ifs9C19Kj@=0l>-nME@Ko2Qo0d*XtV_L=?_R|2yu;VIaAH7p zQyXCRe0UfL7;0Hh=eoYv+X>SRecHz4Vh~FV@s~^v8cq*b?L+GtN_o=}`$#X_E33ef z{K)MCEdsKU4U_4N?oiyfegvlQq=4kPwU6Q+5;927?nRMLsO?KSkH-`V|-qHtw;}08Y7>+f~f;mvPt)eCjc&Y27MHY5rqcT zMkcXMHqtsVltb^c=hI6QVVHEb0*TJqtYn zv}gQ6T6x_QVI`G<>KdLUJ@*ZHp(m23+YWz?9v}FzPgv_NJd>HFPJoB2GIym?ZW23c zFmZs5C&BZz=Axz5V~)mn5@U<|7}W3jwbo%PY&tntZgrnaz|HZMe8x5LIRA#0brh@x!i z0;(7%d|;YZmNZfgW|%$M&(3XOUDj)cb$t~YciX#!Rh!HJixam2ym9xB(M3DJP@{RE z$NGRS12dL^Uh`rDTth#w?umAV0AS)KXxR3ii)pSj{}7Dl6PKlyZavbPm$hRRyv79C z7rZO7W9tTjmmgt)xscGqw3g|}N8Rc_4$R3K(m}R4^Pvu;TyDG}?SKYD^MGPuB?D%h zvXyC`Lv<2>cj}pQ(VGsj|p>ruDCL4$Ig4nH!wMF1Y=v{Bv8W47l`k-fZYu z4Rp2Fgs0fif7~Z?o288dwP|i2BgB=djuX{V>PG-27v!O-CWxoQljy%7Y`?~uIG0{)k%MzOjJ91mUd?Sfuo@+e;}7>TMNicb;*|+?e

    P}59*%~bzZMKf$K=Mg)s&)64a z>ibxEWcd$JYruA|{W4=>=gBNL{cawBfWm-aF?zCOEY~28f1MCbe~pu~{0_}^>VpRY zv?!p26iOCDG^oKCY=yjqT@)E4=F}35=mYW@o99@iYD@+65S2cR<5cYHu%Y=h$oi5I!As&hvuT4eT=D{vUz1{k{D zz@n-B>wHX}LC1nGTeG>dgayFYC#*Ts4Okonvj}{4QoB#%pa)hMGzucW-%p_OE?0-v zW%=HmQg7@q4@R3o7gz+%%fYOI)(=o0>u2h)wyGT)Alc5y2fdN{ZtV!8GUHuLqE%x+QnROlw|vY$siUg zNUKwrP}yTJacG-%XbRNL*wn`b^k@MslM@C7!QYlM5PhA|w_$3Bu6@o8n!0b;Zb)Ox zcE&BoE%HVH=OxWf>_7&gIYVCFVYZ-yC&VA_V}alFCAVz9IxMrqWx2}BB)El-x(Sm{ zLkJcbdhPla2?k|{#|n}XsJIGhuBLu}XUsqg)bc`@GuagQi+@?^;|Fm;6I~H&3GxA1 zwrgw(GV~5fHd_b5?38Z@xDp^-|E~DzC zVjB#xA6PR}+(}@ZHEe~^cO#FuZJ}v)6EmcE9X3X55Yl6YQ2hK3>y$z_S@xLzBL)ia z_VOmo_Q@7x=o8|sLsJ*f?1a*rH}1hN*2P)uS!Fy4l{Ub$jyR6fF_WRHR`NeGJX#W# z8z`VbGVBn?;ioRCi)kM+Lbs0H-34IXN5n@3pXzwxiRy~yOLe}>&cI100Ny0Ezsn7V z%d4!?=gDh0jRovOt_m(Ah63RCE1PHC6IX z8EM+7ZI0?LgR5~IfC?;W%K}fynDun)U7(xSj&E6dY@4O!d!`GHc^bKppH*xhvg=*( z%RVcNRZz!{AJBc6P6ebWnl8QA8DH)aw{C54@m@4NY4g>pT`P&bcfNNqS-hRIFji}2 z1~$U*M-tohI2f!OVQ677UAr4|Lhk==Xt1ht#aFr_=3Ti49u>S1Y(Dez;pt4LyKgQ3 z1;56);CvQ7{qODNgs`FhUG@5V`c2X~>@!DulW^2B;D050-M7 zXb^JXnR-qXVjb@GQSYZDD@wZ1Jj_kJK~Uw*+4Z#F#j{^M^U;S0Lfe0r)^b08b*b}1fxY}T&E4O#A8>^Hbfc3^cxw`3A|$DC@~ z=bRsSTF)nbKs+Mesmp_>Yxcxf@DnNkYd(OnfqjFVvl(F(i+9`nnw09_Nx#IM+In3u z`FC?s@D*cDU%FucUjAY+wZVpC1e@c&8`y(C`cMD?1OhYkTTvli>bJoOdm0RGT^y_` z(Q#)|2JgrKY2H17pe0tRhvi7pje8RPDg1@l0_DwIRTX6*H;MB=R^7_7FEe2v$-bWR zI1hD&6RXv(xe(L-2TX=V&5+eMa;b#JLLpy+Ckmv8YSmaOQvA!SnW>`eK%cu52EJfZ znU_+4`Rg0EC5hMhSlq$c07ZHBL7M;Y}R7wKLz-EpsP_gl17%1w%qC zQ^AUeLH+ph--IF#a61!?4sq{L!P^oV7SL42ysUfT{U9BZ?s5zM=S>2dgmz;wFK(VX zQqJBL6k^}*vmUY^PnR4%a6*e%UY{x(SPug(J*23as0Y-b`~Xu6VSto){iPd!(r`Br z-Un%>=y6y#d1IB{C9AC4GjVELEf`a7E#*vPKr!pl`8eDZ@p?dD34@&)Z%hv-ZfR;g zkq)%1X>kdLRWJz@1M?G2hK;rZE71dbE;XP{u?;Yg<~%@JKfZ?d>L^(rZSM6Vk{Ly2vV~HwU!ij9?EEy znAjP1MTAE3OL%LLCo-fcZpc=CtrEm&HI!*iYAgfEAC{s80TH(Tb2ZTWF?|=&csQvG z0k_t^K?djETBefNX>y(SL2tn^rW(Zuc>9_4*5SQi6-Llno|lBJ8@BPn{+^i%aX9usV;k;G;4!xC<_hDqFMTnaLacHbSiWRO+HBDA6^OX8)*-zA^iDJcY 
zu!xTSGz2p~i3%WW}E3x6Frdqbf+%5I``RHfq`G8J9;=cpA4J=vLz=p#i&QYpJ zsDx`|=;U>`TNTduD*vd*s^GO`YS`Lyf_DAv6%(z}X|KJ12^vdrcx|c%tZFp&32l}r zWaKpUc-||o)eVxcBwnQX%pII@hXAjQKNjr`2*x*?MOaL+W(~=m^ELMuUgbp-+HOv9 zy42G->RIL=Op8`~i?EiD4kpcX7cvtx;XYwy_~hfTaGTNQJ9GqY{E`x8YzXEHESTRF z-zE>KdyMbc5=itinR}WX0Ys{2|9Q8jgy<)sDE+SFhZE?LKKchI z6VB^hS-uj(kdK&co%eXG!Jn^$_FrOOFjc3#iR&gn-758Hg<9~<|l?fer?PE?kDv2VB2du8j1vaj-|WdQ}6ehg<~ z(k|0ViUc2@vR@aj-W-b#gLvO3lD^fAQ(V%ao)kU+=({aLg?bGB65kye(}T+tfEpDm%MjTd(c=&TXv>m4jCHqt9G0~O@|Q}P{|sVgSKlD zeJO(z4Lxx%gK7fen>F5tPm-zT@QE5}luR{}2k=RA3;<1kf&PYXjhV|IVb4{`CXR4< z=Xw8Vba4Sjub40W%P)PUiB_@vj07JrHr8c$>6((C?JYmC=o#_oNAz`w$F^9WfYcDe zfP4nr2-FE-F{#X9PKO^P7|}sLJ>$qP2{S1sDn^2h*6bm02{cEryI$u|u~aDfa4+$t zVJ?fsPJWh1Wl{1~RPJj%R09ga>|l?p)iJWw5wmN*RXzey7uF)lbXPG(q5bj#Krk&Yd@=d1?n! z-<3^9U4@l1N+Z3Y_^RIiVqd+TUPryH*_L|iQdc^9t%|jJHB-|y1ywkC`82OyF2U(q zY&4`xsGz#k-R-jxg@Hv30>%%c5W*b7ejF~J$dOph#W)3DOczK};FCUVga|_r1ah^M zk)qk47$gGOw{62vlv7cXHvg-#9b>9ij46|4D!LWIMxaKiAjO&Gkd$`4%tqA4J7Fod z7s!EWuYlF~P{1y554cQLJKGD|RJ~>RLZ*82JHfhMlTv!h}T#CJ~M zT_jZm&t)LEGAOmRQ2`@L7SYopeRRvvKZ?nTw_OtdG#$8l#0|43Q#G+JWON^V#Hpc# z1iO?Y5iLYm17?eP95!s2;Rc0}TG&Jpre20zw?P~yP7z@S(de(WmgU6NN;zdQ)8fyc z(uD=vRullb^C9rO{_=vZ%bvFfx@31@MCX%|{yVW4EEGlvGglNrcI@pa6ToqVeJ9P6 zc*RTz@q`bXn@Lrfq$ni)dby&Q8|Eb}HVQS8RmQ9jPiLa+GAd4&o1~-_OFld!L>r=M z{yqcG zIq9JLkN4-;UhOR+Yv25)%EyFUrhpyVuz*Zc$#ZdyU;6xv7*c+Ae2uZ9?}>N*3#M`U zSwOd9om|ZUyJ2N)#X96u%8n;+-YwW2sOF^j74;z7vt%hX>~SYlE!r-O?k%Ew@BALZ9|Io{CNY}beHpmWrF zVrtCFWFAp1MycZ6Mxx(4hye)&u_NI0Qy-iWE(muIUz=pt%a!j~KcPu$rUPSTADI8T zJ>6jU^wd=P5q6HMQY_Fe%e8>@^`Q-W4os{aSg?%XqL_IB?^c=@x&r`xXvU@@c~N4y zMO1C#mKx}4bT6nzGArpu%v|TLXRbq6fWuXMK?%FYl1MQ(^DGdhK}uJbacO2I}EyGWkqT1D7GQLhzTgOZ@g6I)|~*>YUCk z?-l(DJ_*W6Y;~cF{U}v11-0B1oYsanLao_-yqEV51S*y~;e9%gidqFchYTg}jns+_ zLA`vhzDswrY;Z;oRLhVCC45KkVIY~67BF@6LlO^tj_;!eTaxWMOwFbnpjFY`h)HU^ z7#&CLUm<<0l4UzN(gBBG#gXZy@ar~D{DAg}cJ#n=9s(CiV&Z5*CSdcGbYK`(Zq=5l zES(TG-V1GZ`;};23A--Hk9JRhOyUyl9!|eY5O%xKfGGoCkRUaeiE=@PT5H)BHNBT4 
za=6}K2|+Pr!IksAv>c&1m~AYXFlnM3T;{_vMW4E&FCp~@r614Dmq-_OMKgpEIX=0< zcqZ^8%Z!>viWtcF*8Ttg?Q#NtC73Vm^2rv?NW5ITR_EV^rl{ei}}ir z91uP~n|6bAP^U3{6(Z3kcmaVm!S6@Oi6CS)!!vz6fI}SU!&XwQ;G|j`U2kN|*Z_dN z#yeZs&(gW~tL$l@518B?$!tzY#rR)S@2(`@Kt)UG>!O6eSso=5=l;NC^|ZCncyaor zOV5vP>jfq}dwTBT(~%J@`g{H!xQ3iyy1mwTzSx}N`F&#( zsxy^Wtn4hAkYSJYFxf zXfqkTA7m!v{Ui~Pb_rb+r8I*@APz+pP!Z={8ZyvAYXw1+JJCj z41-`F%o(h$p2*2XzxO?PKd|CRJ2@QnURq2c4o8LcB_?UJ8%fp-a#&WibfBfA)I32fJYQ{c$F1 zx@WVSg%I_A(4nH^SUpGQA59Xe;*R?v?(7_Xx%7T4=!f>@iyL;vo6^L_--AIxISKMX3P(qmg)ncemb_YSVGCLLhs?G!d?+7k&R_R= zCL^!)qiGlbqZ^||wzsDzy{?D%IBgW8VZ6Eq&Iu0_e*e-vOw$azX?+$?mrW;>ROiwF zVZZ#s6`IU=>q6`o4^O+5Orw1a>|K}2zD!D+C!&q#K(AQp`SyYl^cjU`WcUWCOlTVhJC+REO#|t05WuPPQA@BzJzyABiNl!5L z@v)mPr9AYiNza+jf7Z;*85z&IuYB)V$`)N$|JZ(<)-@R|Omx<@)O_jmmYJN%*Nn=h z$%CrPOc?Z-zSL+)Xmi}LsqJSOHXs-LjAT6Vgqu)kCwN6mrS~>=M5(O& zomN`CXQ;F?-;_wOLPiv%W5X3`V3qw>#rdIk@z9NS=|I~79tt??-6!b-;skkUNHqh0 zNsF>Ibs)MclQ6)J=`joxn5NAGgT&MJfGo2n=>oiQcNn+P6lB#ynvAvB6?B>PmYl|$ z_C84K!1;YHf`q36%`z{|TwrRuaXMxhN`yC}go34kFgoN2r;Tz2*%t;@nKw)37K;nC z)7rw>FfN1J`(fNUK|jnr+sEM@mSbtjAK>K}t#k#9w^LV;?zUr2F*$2q1VQ`sl@L0N zVJiVl4u(yWuA;W+pz5S729Qmb79)tZ5rh)yRo(KIDusRV_Tlg={AR=d92`U1ncjKjl7=+&`i7DTh|3C`C_ zWZVRu0*eLKpkY?t+Snw++G9kuT0{!aQVahLi4 zqi|jdCW=XdREf&7O3|Q7bW&M&yrGW7b~-%S2i2%}M9InG0Tri;d(=DEJYzdFV9oso z_}YwJrW~*AqwHadcF z%*P$uvM$?NpoXfgD>g$;P3G5nYqs|2WG9R7B*&jpG4%`8BLcJc9uRH`J8$c1!)_GgbJ0G#%%Q$Z|q^rDOafg_~0QFSo&dJ4*NXKxY?$=n1(m9yP!H)F!Oh^`YYl5B=i%d%?jlyMdj?Y2G=_eJRXTq0!7_r2C1}OMjk02q-_Z7k~u*MtfOF-}cACNG# zR{@7g!}_|;RFT0u;FA%0en;c6rPNX#9rH{5Sv3F81wZrxND2D}#`8rZk|4cEJ7`|t zGxGtrPG8$Ax`5oi6Q~UHI#HR{N~$k#-u=u;d5Jwh77^UAIe&3E>2Fbb>8k?kIJljV z%Z&2KhPo&Ew?Psf0y@NJh{!V&_qSmXffWZvs0x~Y+tz>%tcZ(mnO{yl$N~^)$mZ?x6 zCF)BgVfm7X3H;GV0DTN0MrO^;YHHnhN9i0 zEG@7ECHzXSw9aaBS}QZ&%qe|k(P{?1WiQq?am;qj4km8!Xr}ydM1daYAZk>Ig$ctx zfdjZ=K}Zd31$SgSwAfCPPtCkL_50X~CI#iza!Ohdp*3h+w1bIlVV^3{)#^dsK}0xu zMHL(@i}rCfkv6V;HF$CZS?;&l$E{rpZ(I0-wTL5D!ktIdTR;UuROq}m*NG@PPZ$Nb&RSQmTJ~vNKdXO0%|e3=W#iCjS+Rzk)7*%mN4+DTbMf% 
zBd_modP;A+P{HKZbN&z5Vgy>Y07uJEj3>Xxasd8h zK7_`Gr!{0oxil$0eY!SYl+inhhQhy|M50-`;Jn!)R1WOCa$PX~3fcJ2(%C(ZmhSzr z`Nvl=Zlk2zE%TyvYd;VGfPnw}bNBz>#1FrxQ7Xbu0Py`QBaVTh|7PA@d-HZvdysyp zQ~?yh0HgRH=bE@MRqLM&X6@ad5Zb{!$gzo^YReg(mA{t(RmlZljb_NEY=-~sk=W$O-79V{Q@fg;$!u}!QFot6CZHf#PsOVgl|ou)$qX;%XRMz4sU zJw?*#8OSB<*&Gnd7Rotq!}#adQWm&pky9%NY$TB|WYl_~|t_BGCF5%zK+JH$88csk-(uWWV@Wxe7D_PpepulVdq6c7Dl& zJ{^#6Xsjaf+F7FgqumxZ9>{#nu{O18G;pE4mfOYZ!b(-%6HkD<^gS%V*wWTg)E6#L zKiMd6=8JekVhSesS69lsjY8T1feM*JAiX#+tm0Reg3|Z%#D<^&L_R3}9!|IgTZ>4loJMT3d17B4_|HCu+yp1;h=ca!bPjLgd5TNE<8vY_phD^FBP9U z;Z4%gFMJ8M)-U_vX&qT1@V736aJTYc>^Uqegw4Kz-G2KG2f@~RE^J3cb|gv!6uX!& zR_&Vrwg;HTs#N-m=6}VK(FsXxSbA5-(cTY6*OX%RsdGn(!cH@15`rqfKJChSL@1ZQ zlIIRDG&GE^&efWIK2lk)<(-AQ>GdZKlKmC?ge2qr!YeX=>$L7Noz5g6zw~wJ6~BYY zB#lzGN!pVpYmk92G@x-k$i(2+Eid1+P`$#ge=(HL##27>GUsb@KI#9Z+T}^0gkBcL zb=@aq--S!4c?AG7$@I2U*3~E3v}S*+_1q220zJhPM6`{hQm-lJqWVn8rE-^fzv7n-C z;{4;O5Gd)kj4W=_&tchQr@tetpcJM3PZ8-kr-&PedF+QQHYQ-qF;yw`PA&(g7 ziAq$`N{(`jc;AyCu~I5s8A+A(8ULzW<*OjMidCwz6i-#fGX{CCs#Pnl_!6W_E3u@K zOHo}7-zwEbrB&SyL=v9B=?yUYKLi(_keHO5lA4yDk(rg9lbe@cK(Okz$Og0NzQkVW zkF!bs2=4E(j9ec26BNf@oW;GKdaJIfg=k)8mzN(lZi<+A7>`J#1$4Ac+n8y)c*3ls zd8cHte}=>Dnl5?qOw!I|Q>U5I#3tF!bjNA$2Ri6DxhYNU)HI*-pQe{#A)T9{OwFW= zjAoI^OlD}d=4h_wX+B+bYk?MOkrvavhwrq63tFmW^lUj_YK2y6l~!ww)+$Tuv|byu zQJa*l&Dx@^+NSN=p`F^L-P)tQ+Nb?Gpo2Q3!#YAPm)Xn~9n~=%*9kTmi~s@&BA5_D z2_u{cB6U)ybebG~<9Ghh8J*QRES<*>AEJonAU1YV+zzfXA1@sEQ=6|YaDtQ6>mnX_ z(n}w;)NzgLoZ>WRILm$xaEXtw9Yj{|1G+XUXx7QE-*iZf3Fa6qY{oWtmBnx}Pg}$$B-Eh4Aq?DH9N}S+~<}@$$ zGENAd6g=YyA~@b2b?m>^XJ`MNhaF$26q@;UlhySg-FxmS{SeCDio+h;vP@5&>Y<*k zg}YRJPsFDm$S2Ycpd0wa_a^Y?&;@xk6xp|jMh59n=okwr%6xnjyer$x%F>VOjp}Ja zd$hvVWXg`%RA{Epc3L;igWzBSQilvuOEYuLORbfNzS`nET@!D2bgg?WB(CPkbkj|~ z%!6)CrrQ&^_2gEozg?xeUHt3Tn6XQ&+qEvW?#I;bpFOzuDYl+x&#g8hA<-U}SdTcG z&z2|Blbl-*pZ0s76JnXSW$*+Ua@*ATnKz^ly_!~8Xd#n+CkRE`VO>A$iOm-i>TYm7 z61l@IXl@p>d_uFg{)!pR%&EZnlk10LcZ0M$ju|s@7t^4-#`lA*wJ1MaFPmrPJH}La z(X6|>ao*7hyX3O{+ksmwcpS~Q0RY|8Jmn|&RcIKniRCqaxYUz*YDXI6Gth!12lSx! 
z-K_o%^?*vcc8_kxTB_llC|`Wj6o?tDsS-E%+Xor~_`4wmdV zaoyz79CYX?iXzZqz<>rF4ItL2J##4j7jjD5M-(Gs261q?V_z+Hce8${Swm zeYg)#Yfi+5SC`ttxQm>NQ)?Y9G@537Xc;X8MKjLa^JOLV0ZOYg=dX?uOYP{_pJ#7T zBoN5FiDhbCM@+oC8vJS71DlnBNykQKKw@pGBkDCNbZ85fu1hOpi@@nRuQIg=rmhQ= zN0aRO1IgYOT6WvMa@9GhTwNICHg>X??^+-jzFsG1bubExGsZn##<9O7w09wG2j2ML zL=oXwmX^N1ub#jZsT#();1}oiilmez*7G8`4t{p~Zh#~E;K}%T_^x`cYW@kg+Fe(q zWnJG9h;ko%;4jX4ACP#oMfv)~kJDBq_w#pJ?&$=zCT=Ip{(kp*UwG%b)mi0Y;i1B7 z0_v1YosWKs1r6RU*@Cv$vChj~&>}rwcI};u@Gu%&FUQNo2R*nlkuATx|5X&&_@3Ju fQ*kHIZh4pgL`8Ft*_IwR`{D1~(|2fa761SMsr`6q literal 0 HcmV?d00001 diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index e0a16fcec5..8880e2bb2c 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -174,7 +174,7 @@ border-style: solid; border-color: rgba(0, 0, 0, 0.87); border-width: 1px; - border-radius: 6px; + border-radius: 3px; box-shadow: none; padding: .6rem .8rem; background: none; @@ -184,14 +184,14 @@ border-style: solid; border-color: rgba(0, 0, 0, 0.87); border-width: 1px; - border-radius: 6px; + border-radius: 3px; box-shadow: none; padding: .6rem .8rem; /*background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05));*/ } .md-typeset iframe { - border-radius: 6px; + border-radius: 3px; } [dir=ltr] .md-typeset :is(.admonition,details) blockquote { @@ -260,6 +260,8 @@ @media screen and (min-width: 76.1875em) { .md-header__topic:first-child { font-size: 24px; + top: 2px; + left: 1px; /*font-family: Poppins, metro-web, Metro, -apple-system, "system-ui", "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif;*/ /*font-weight: 500;*/ } @@ -329,84 +331,19 @@ } @font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 500; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLGT9Z11lFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0900-097F, U+1CD0-1CF6, 
U+1CF8-1CF9, U+200C-200D, U+20A8, U+20B9, U+25CC, U+A830-A839, U+A8E0-A8FB; -} -/* latin-ext */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 500; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLGT9Z1JlFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; -} -/* latin */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 500; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLGT9Z1xlFd2JQEk.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; -} -/* devanagari */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 600; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLEj6Z11lFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0900-097F, U+1CD0-1CF6, U+1CF8-1CF9, U+200C-200D, U+20A8, U+20B9, U+25CC, U+A830-A839, U+A8E0-A8FB; -} -/* latin-ext */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 600; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLEj6Z1JlFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; -} -/* latin */ -@font-face { - font-family: 'Poppins'; - font-style: normal; - font-weight: 600; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLEj6Z1xlFd2JQEk.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; -} -/* devanagari */ -@font-face { - font-family: 'Poppins'; - 
font-style: normal; - font-weight: 700; - font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLCz7Z11lFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0900-097F, U+1CD0-1CF6, U+1CF8-1CF9, U+200C-200D, U+20A8, U+20B9, U+25CC, U+A830-A839, U+A8E0-A8FB; -} -/* latin-ext */ -@font-face { - font-family: 'Poppins'; + font-family: 'Geist Pixel Square'; font-style: normal; - font-weight: 700; + font-weight: 400; font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLCz7Z1JlFd2JQEl8qw.woff2) format('woff2'); - unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF; + src: url(../fonts/GeistPixel-Square.woff2) format('woff2'); } -/* latin */ + @font-face { - font-family: 'Poppins'; + font-family: 'Geist Pixel Circle'; font-style: normal; - font-weight: 700; + font-weight: 400; font-display: swap; - src: url(https://fonts.gstatic.com/s/poppins/v20/pxiByp8kv8JHgFVrLCz7Z1xlFd2JQEk.woff2) format('woff2'); - unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD; + src: url(../fonts/GeistPixel-Circle.woff2) format('woff2'); } /* latin */ @@ -519,7 +456,7 @@ h4.doc-heading { border-radius: 3px; font-size: 15px; - /*border-radius: 6px;*/ + /*border-radius: 3px;*/ /*border-top: 1px solid #dce0e6;*/ /*background-color: rgba(0,0,0,.87);*/ /*padding: 15px 20px;*/ @@ -1115,7 +1052,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { @media screen and (min-width: 76.1875em) { .md-typeset .tabbed-block > .highlight:first-child > pre > code, .md-typeset .tabbed-block > pre:first-child > code { - border-radius: 6px; + border-radius: 3px; } } @@ -1135,8 +1072,8 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { background: none; z-index: 1; padding: 5px; - border-radius: 6px; - border: 1px solid black; + border-radius: 3px; + border: 
1px dotted black; bottom: -0.7px; top: -0.7px; left: -0.7px; @@ -1172,7 +1109,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { height: 100%; background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.025), rgb(0 114 255 / 0.25%), rgba(0, 42, 255, 0.0125)); z-index: 1; - border-radius: 6px; + border-radius: 3px; border: 0.5px solid rgba(0,0,0, 0.5); overflow: unset; } @@ -1213,7 +1150,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { display: block; margin: 0; padding: 1rem 1.4rem; - border-radius: 6px; + border-radius: 3px; border: rgba(0,0,0,0.6) 0.5px solid; } @@ -1686,7 +1623,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { border: none; color: var(--md-default-fg-color); padding: 8px 25px; - border-radius: 6px; + border-radius: 3px; background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05)); } diff --git a/docs/assets/stylesheets/landing.css b/docs/assets/stylesheets/landing.css index 6efd82de9d..f364a73580 100644 --- a/docs/assets/stylesheets/landing.css +++ b/docs/assets/stylesheets/landing.css @@ -66,7 +66,8 @@ @media screen and (max-width: 76.1875em) { .tx-landing h1 { - font-size: 1.4rem; + font-size: 1.9rem; + margin: 24px -24px; } .tx-landing__hero_text { @@ -110,9 +111,9 @@ } .tx-landing__hero_text h1 { - font-size: 3rem; + font-size: 3.2rem; max-width: 36rem; - line-height: 1.2; + line-height: 1.1; } .tx-landing__hero_text p { @@ -241,7 +242,7 @@ .md-typeset .md-button { margin-top: 0.6rem; margin-bottom: 1.5rem; - font-size: 19px; + font-size: 20px; font-weight: 400 !important; text-align: center; border-radius: 3px; @@ -296,7 +297,6 @@ [data-md-color-primary=white] .md-header__buttons .md-button--primary, [data-md-color-primary=white].md-header__buttons .md-button--primary:hover, [data-md-color-primary=white] .md-typeset .md-button--primary, [data-md-color-primary=white] .md-typeset .md-button--primary:hover { background: rgba(0, 0, 0, 0.87); - border: 1.5px solid 
rgba(0, 0, 0, 0.87); border-radius: 3px; font-weight: 400 !important; /*margin-right: 10px;*/ @@ -353,7 +353,7 @@ .md-typeset .md-button-secondary:focus { background: transparent; color: rgba(0, 0, 0, 0.87); - border: 1px solid rgba(0, 0, 0, 0.87); + border: 0.5px solid rgba(0, 0, 0, 0.87); border-radius: 3px; } @@ -770,7 +770,7 @@ } .tx-landing__major_feature h2 { - font-size: 1.7em; + font-size: 2em; max-width: 500px; margin-top: 0; margin-bottom: 1.5em; @@ -778,6 +778,7 @@ -webkit-background-clip: text; -webkit-text-fill-color: transparent; /*letter-spacing: -1.5px;*/ + line-height: 1.1; } .tx-landing__major_feature { @@ -1135,3 +1136,28 @@ border: 0; } } + +.tx-landing h1, +.tx-landing h2, +.tx-landing h3, +.tx-landing h4, +.tx-landing h5, +.tx-landing h6 { + font-family: 'Geist Pixel Circle', var(--md-text-font-family); +} + +.tx-faq__item-title { + font-family: 'Geist Pixel Square', var(--md-text-font-family); +} + +.md-header__title { + font-family: 'Geist Pixel Square', var(--md-text-font-family); +} + +.md-header__buttons .md-button { + font-family: 'Geist Pixel Square', var(--md-text-font-family); +} + +.tx-landing .md-button { + font-family: 'Geist Pixel Square', var(--md-text-font-family); +} diff --git a/docs/assets/stylesheets/pricing.css b/docs/assets/stylesheets/pricing.css index b358d78609..93ba7484f0 100644 --- a/docs/assets/stylesheets/pricing.css +++ b/docs/assets/stylesheets/pricing.css @@ -143,7 +143,7 @@ position: relative; padding-right: 40px; color: #2A292D; - font-size: 0.85rem; + font-size: 1rem; font-weight: 800; line-height: 1.33; cursor: pointer; diff --git a/docs/overrides/home.html b/docs/overrides/home.html index 7cebed7b6a..36a9326230 100644 --- a/docs/overrides/home.html +++ b/docs/overrides/home.html @@ -50,12 +50,12 @@
    -

    The orchestration layer for modern ML teams

    +

    The new GPU-native orchestration

    - dstack provides ML teams with a unified control plane for GPU provisioning and orchestration - across cloud, Kubernetes, and on-prem. It streamlines development, training, and inference — reducing costs 3–7x and - preventing lock-in. + dstack is a GPU-native orchestration built for modern AI teams. It simplifies + GPU provisioning and workload management across clouds, Kubernetes, and on-prem — + through a single unified control plane.

    @@ -82,22 +82,21 @@

    The orchestration layer for modern ML teams

    - One control plane for GPUs across cloud, Kubernetes, and on-prem. + Reduce GPU costs by 3–7× and eliminate vendor lock-in.

  • -

    An open platform for GPU orchestration

    +

    A unified control plane for GPU orchestration

    - Managing AI infrastructure requires efficient GPU orchestration, whether workloads run - on a single GPU cloud, across multiple GPU providers, or on-prem clusters. + Managing AI infrastructure requires efficient GPU orchestration tightly integrated with open-source training and + inference frameworks.

    - dstack provides an open stack for GPU orchestration that streamlines development, training, - and inference, and can be used with any hardware, open-source tools, and frameworks. + dstack provides a unified control plane—so workloads stay portable, reproducible, and infrastructure remains interchangeable.

    @@ -173,22 +172,21 @@

    An open platform for GPU orchestration

    @@ -201,35 +199,32 @@

    Native integration with GPU clouds

    Easy to use with on-prem clusters

    - For provisioned Kubernetes clusters, connect them to dstack using the Kubernetes backend. - If you run vanilla bare-metal servers or VMs without Kubernetes, use SSH fleets - instead. + If you already run Kubernetes on-prem, connect your cluster to dstack using the Kubernetes backend. +

    + +

    + For bare-metal servers or VMs without Kubernetes, use SSH fleets to orchestrate GPUs directly.

    - + SSH fleets - + - + Kubernetes - +

    - -

    - Either way, connecting existing on-prem clusters to dstack takes just minutes. -

    @@ -250,18 +245,19 @@

    Easy to use with on-prem clusters

    Dev environments

    - Before training or deployment, ML engineers explore and debug their code. + Before training or deploying models, ML engineers need interactive GPU access to experiment and debug.

    -

    dstack's dev environments make it easy to connect your - desktop IDE to powerful cloud or on-prem GPUs—streamlining the entire development loop. +

    dstack's dev environments let you connect desktop IDEs such as VS + Code, Cursor, and + Windsurf directly to cloud or on-prem GPUs.

    + class="md-button md-button-secondary"> Dev environments - +

    @@ -274,18 +270,19 @@

    Dev environments

    Single-node & distributed tasks

    - Move from single-instance experiments to multi-node distributed training without friction. dstack lets you define complex jobs with a simple configuration, - handling the scheduling and orchestration for you. + Run training or batch workloads on a single GPU, or scale to multi-GPU and multi-node clusters using simple task configurations. + dstack automates cluster provisioning, resource allocation, and job scheduling.

    -

    This allows your team to focus on research while ensuring that expensive cluster resources are utilized efficiently.

    +

    + During execution, dstack reports GPU utilization, memory usage, and GPU health metrics for each job. +

    + class="md-button md-button-secondary"> Tasks - +

    @@ -308,23 +305,27 @@

    Single-node & distributed tasks

    Scalable model inference

    - With dstack, you can easily deploy any model as a secure, - auto-scaling OpenAI-compatible endpoint, all while using your custom code, Docker image, and - serving framework. + With dstack, you can deploy models as secure, + auto-scaling, OpenAI-compatible endpoints, integrating with top open-source serving frameworks. +

    + +

    + dstack supports disaggregated prefill/decode and cache-aware routing, providing + production-grade, optimized inference.

    + class="md-button md-button-secondary"> Services - + - +

    @@ -332,7 +333,7 @@

    Scalable model inference

    -

    Loved by world-class ML teams

    +

    Loved by world-class AI teams

    @@ -617,7 +618,7 @@

    dstack Sky

    @@ -641,7 +642,7 @@

    dstack Enterprise

    diff --git a/docs/overrides/main.html b/docs/overrides/main.html index 6b2f47dbc0..dc800b5945 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -1,5 +1,10 @@ {% extends "base.html" %} +{% block extrahead %} + + +{% endblock %} + {% block container %}
    {% if "navigation.path" in features %} From f7a977d83741a5df3a97b91789d8f73aa831089d Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 18 Feb 2026 15:06:13 +0500 Subject: [PATCH 144/187] Implement pipeline tasks (#3581) * Rename tasks/ to scheduled_tasks/ * Support pipeline tasks * Stop workers on pipeline shutdown * Add pipeline draining * Use returning instead of rowcount for heartbeat * Add ComputeGroupPipeline and PlacementGroupPipeline * Add TestComputeGroupWorker * Add TestPlacementGroupWorker * Add DSTACK_FF_PIPELINE_PROCESSING_ENABLED * Fixes * Rename scheduled_tasks tests * Add TestHeartbeater * Split pipeline migration in two * Add pipeline indexes for compute and placement groups * Make PipelineItem a dataclass --- .../_internal/core/models/compute_groups.py | 7 + src/dstack/_internal/server/app.py | 18 +- .../_internal/server/background/__init__.py | 142 -------- .../background/pipeline_tasks/__init__.py | 69 ++++ .../server/background/pipeline_tasks/base.py | 344 ++++++++++++++++++ .../pipeline_tasks/compute_groups.py | 335 +++++++++++++++++ .../pipeline_tasks/placement_groups.py | 263 +++++++++++++ .../background/scheduled_tasks/__init__.py | 159 ++++++++ .../{tasks => scheduled_tasks}/common.py | 0 .../compute_groups.py} | 0 .../events.py} | 0 .../fleets.py} | 0 .../gateways.py} | 0 .../idle_volumes.py} | 0 .../instances.py} | 2 +- .../metrics.py} | 0 .../placement_groups.py} | 0 .../probes.py} | 0 .../prometheus_metrics.py} | 0 .../running_jobs.py} | 2 +- .../runs.py} | 0 .../submitted_jobs.py} | 4 +- .../terminating_jobs.py} | 0 .../volumes.py} | 0 ..._add_computegroupmodel_pipeline_columns.py | 47 +++ ...dd_placementgroupmodel_pipeline_columns.py | 47 +++ ...0_add_pipeline_indexes_for_compute_and_.py | 57 +++ src/dstack/_internal/server/models.py | 28 +- .../services/jobs/configurators/base.py | 2 +- .../_internal/server/services/pipelines.py | 12 + src/dstack/_internal/settings.py | 3 + .../background/pipeline_tasks}/__init__.py | 0 
.../background/pipeline_tasks/test_base.py | 183 ++++++++++ .../pipeline_tasks/test_compute_groups.py | 113 ++++++ .../pipeline_tasks/test_placement_groups.py | 63 ++++ .../{tasks => scheduled_tasks}/__init__.py | 0 .../test_compute_groups.py} | 4 +- .../test_events.py} | 2 +- .../test_fleets.py} | 2 +- .../test_gateways.py} | 2 +- .../test_idle_volumes.py} | 2 +- .../test_instances.py} | 42 +-- .../test_metrics.py} | 2 +- .../test_placement_groups.py} | 2 +- .../test_probes.py} | 6 +- .../test_prometheus_metrics.py} | 2 +- .../test_running_jobs.py} | 2 +- .../test_runs.py} | 12 +- .../test_submitted_jobs.py} | 2 +- .../test_submitted_volumes.py} | 2 +- .../test_terminating_jobs.py} | 2 +- .../_internal/server/routers/test_runs.py | 2 +- 52 files changed, 1792 insertions(+), 196 deletions(-) create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/__init__.py create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/base.py create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py create mode 100644 src/dstack/_internal/server/background/scheduled_tasks/__init__.py rename src/dstack/_internal/server/background/{tasks => scheduled_tasks}/common.py (100%) rename src/dstack/_internal/server/background/{tasks/process_compute_groups.py => scheduled_tasks/compute_groups.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_events.py => scheduled_tasks/events.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_fleets.py => scheduled_tasks/fleets.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_gateways.py => scheduled_tasks/gateways.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_idle_volumes.py => scheduled_tasks/idle_volumes.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_instances.py => 
scheduled_tasks/instances.py} (99%) rename src/dstack/_internal/server/background/{tasks/process_metrics.py => scheduled_tasks/metrics.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_placement_groups.py => scheduled_tasks/placement_groups.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_probes.py => scheduled_tasks/probes.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_prometheus_metrics.py => scheduled_tasks/prometheus_metrics.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_running_jobs.py => scheduled_tasks/running_jobs.py} (99%) rename src/dstack/_internal/server/background/{tasks/process_runs.py => scheduled_tasks/runs.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_submitted_jobs.py => scheduled_tasks/submitted_jobs.py} (99%) rename src/dstack/_internal/server/background/{tasks/process_terminating_jobs.py => scheduled_tasks/terminating_jobs.py} (100%) rename src/dstack/_internal/server/background/{tasks/process_volumes.py => scheduled_tasks/volumes.py} (100%) create mode 100644 src/dstack/_internal/server/migrations/versions/57cff3ec86ce_add_computegroupmodel_pipeline_columns.py create mode 100644 src/dstack/_internal/server/migrations/versions/9c2a227b0154_add_placementgroupmodel_pipeline_columns.py create mode 100644 src/dstack/_internal/server/migrations/versions/a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py create mode 100644 src/dstack/_internal/server/services/pipelines.py rename src/{dstack/_internal/server/background/tasks => tests/_internal/server/background/pipeline_tasks}/__init__.py (100%) create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_base.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py rename src/tests/_internal/server/background/{tasks => 
scheduled_tasks}/__init__.py (100%) rename src/tests/_internal/server/background/{tasks/test_process_compute_groups.py => scheduled_tasks/test_compute_groups.py} (96%) rename src/tests/_internal/server/background/{tasks/test_process_events.py => scheduled_tasks/test_events.py} (94%) rename src/tests/_internal/server/background/{tasks/test_process_fleets.py => scheduled_tasks/test_fleets.py} (98%) rename src/tests/_internal/server/background/{tasks/test_process_gateways.py => scheduled_tasks/test_gateways.py} (98%) rename src/tests/_internal/server/background/{tasks/test_process_idle_volumes.py => scheduled_tasks/test_idle_volumes.py} (98%) rename src/tests/_internal/server/background/{tasks/test_process_instances.py => scheduled_tasks/test_instances.py} (96%) rename src/tests/_internal/server/background/{tasks/test_process_metrics.py => scheduled_tasks/test_metrics.py} (98%) rename src/tests/_internal/server/background/{tasks/test_process_placement_groups.py => scheduled_tasks/test_placement_groups.py} (94%) rename src/tests/_internal/server/background/{tasks/test_process_probes.py => scheduled_tasks/test_probes.py} (96%) rename src/tests/_internal/server/background/{tasks/test_process_prometheus_metrics.py => scheduled_tasks/test_prometheus_metrics.py} (98%) rename src/tests/_internal/server/background/{tasks/test_process_running_jobs.py => scheduled_tasks/test_running_jobs.py} (99%) rename src/tests/_internal/server/background/{tasks/test_process_runs.py => scheduled_tasks/test_runs.py} (98%) rename src/tests/_internal/server/background/{tasks/test_process_submitted_jobs.py => scheduled_tasks/test_submitted_jobs.py} (99%) rename src/tests/_internal/server/background/{tasks/test_process_submitted_volumes.py => scheduled_tasks/test_submitted_volumes.py} (96%) rename src/tests/_internal/server/background/{tasks/test_process_terminating_jobs.py => scheduled_tasks/test_terminating_jobs.py} (99%) diff --git a/src/dstack/_internal/core/models/compute_groups.py 
b/src/dstack/_internal/core/models/compute_groups.py index 66e1292eff..3fa967494d 100644 --- a/src/dstack/_internal/core/models/compute_groups.py +++ b/src/dstack/_internal/core/models/compute_groups.py @@ -12,6 +12,13 @@ class ComputeGroupStatus(str, enum.Enum): RUNNING = "running" TERMINATED = "terminated" + @classmethod + def finished_statuses(cls) -> List["ComputeGroupStatus"]: + return [cls.TERMINATED] + + def is_finished(self): + return self in self.finished_statuses() + class ComputeGroupProvisioningData(CoreModel): compute_group_id: str diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index dbea6f777b..209679f0ef 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -23,8 +23,9 @@ from dstack._internal.proxy.lib.deps import get_injector_from_app from dstack._internal.proxy.lib.routers import model_proxy from dstack._internal.server import settings -from dstack._internal.server.background import start_background_tasks -from dstack._internal.server.background.tasks.process_probes import PROBES_SCHEDULER +from dstack._internal.server.background.pipeline_tasks import start_pipeline_tasks +from dstack._internal.server.background.scheduled_tasks import start_scheduled_tasks +from dstack._internal.server.background.scheduled_tasks.probes import PROBES_SCHEDULER from dstack._internal.server.db import get_db, get_session_ctx, migrate from dstack._internal.server.routers import ( auth, @@ -163,8 +164,11 @@ async def lifespan(app: FastAPI): if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None: init_default_storage() scheduler = None + pipeline_manager = None if settings.SERVER_BACKGROUND_PROCESSING_ENABLED: - scheduler = start_background_tasks() + scheduler = start_scheduled_tasks() + pipeline_manager = start_pipeline_tasks() + app.state.pipeline_manager = pipeline_manager else: logger.info("Background processing is disabled") PROBES_SCHEDULER.start() @@ -189,9 
+193,15 @@ async def lifespan(app: FastAPI): for func in _ON_STARTUP_HOOKS: await func(app) yield + PROBES_SCHEDULER.shutdown(wait=False) + if pipeline_manager is not None: + pipeline_manager.shutdown() if scheduler is not None: + # Note: Scheduler does not cancel currently running jobs, so scheduled tasks cannot do cleanup. + # TODO: Track and cancel scheduled tasks. scheduler.shutdown() - PROBES_SCHEDULER.shutdown(wait=False) + if pipeline_manager is not None: + await pipeline_manager.drain() await gateway_connections_pool.remove_all() service_conn_pool = await get_injector_from_app(app).get_service_connection_pool() await service_conn_pool.remove_all() diff --git a/src/dstack/_internal/server/background/__init__.py b/src/dstack/_internal/server/background/__init__.py index 8577cce6f1..e69de29bb2 100644 --- a/src/dstack/_internal/server/background/__init__.py +++ b/src/dstack/_internal/server/background/__init__.py @@ -1,142 +0,0 @@ -from apscheduler.schedulers.asyncio import AsyncIOScheduler -from apscheduler.triggers.interval import IntervalTrigger - -from dstack._internal.server import settings -from dstack._internal.server.background.tasks.process_compute_groups import process_compute_groups -from dstack._internal.server.background.tasks.process_events import delete_events -from dstack._internal.server.background.tasks.process_fleets import process_fleets -from dstack._internal.server.background.tasks.process_gateways import ( - process_gateways, - process_gateways_connections, -) -from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes -from dstack._internal.server.background.tasks.process_instances import ( - delete_instance_health_checks, - process_instances, -) -from dstack._internal.server.background.tasks.process_metrics import ( - collect_metrics, - delete_metrics, -) -from dstack._internal.server.background.tasks.process_placement_groups import ( - process_placement_groups, -) -from 
dstack._internal.server.background.tasks.process_probes import process_probes -from dstack._internal.server.background.tasks.process_prometheus_metrics import ( - collect_prometheus_metrics, - delete_prometheus_metrics, -) -from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs -from dstack._internal.server.background.tasks.process_runs import process_runs -from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs -from dstack._internal.server.background.tasks.process_terminating_jobs import ( - process_terminating_jobs, -) -from dstack._internal.server.background.tasks.process_volumes import process_submitted_volumes - -_scheduler = AsyncIOScheduler() - - -def get_scheduler() -> AsyncIOScheduler: - return _scheduler - - -def start_background_tasks() -> AsyncIOScheduler: - # Background processing is implemented via in-memory locks on SQLite - # and SELECT FOR UPDATE on Postgres. Locks may be held for a long time. - # This is currently the main bottleneck for scaling dstack processing - # as processing more resources requires more DB connections. - # TODO: Make background processing efficient by committing locks to DB - # and processing outside of DB transactions. - # - # Now we just try to process as many resources as possible without exhausting DB connections. - # - # Quick tasks can process multiple resources per transaction. - # Potentially long tasks process one resource per transaction - # to avoid holding locks for all the resources if one is slow to process. - # Still, the next batch won't be processed unless all resources are processed, - # so larger batches do not increase processing rate linearly. - # - # The interval, batch_size, and max_instances determine background tasks processing rates. 
- # By default, one server replica can handle: - # - # * 150 active jobs with 2 minutes processing latency - # * 150 active runs with 2 minutes processing latency - # * 150 active instances with 2 minutes processing latency - # - # These latency numbers do not account for provisioning time, - # so it may be slower if a backend is slow to provision. - # - # Users can set SERVER_BACKGROUND_PROCESSING_FACTOR to process more resources per replica. - # They also need to increase max db connections on the client side and db side. - # - # In-memory locking via locksets does not guarantee - # that the first waiting for the lock will acquire it. - # The jitter is needed to give all tasks a chance to acquire locks. - - _scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1)) - _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1) - _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1) - _scheduler.add_job(delete_events, IntervalTrigger(minutes=7), max_instances=1) - if settings.ENABLE_PROMETHEUS_METRICS: - _scheduler.add_job( - collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1 - ) - _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1) - _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15)) - _scheduler.add_job(process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5) - _scheduler.add_job( - process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5 - ) - _scheduler.add_job( - process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1 - ) - _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5)) - _scheduler.add_job( - process_fleets, - IntervalTrigger(seconds=10, jitter=2), - max_instances=1, - ) - _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1) - for replica in 
range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR): - # Add multiple copies of tasks if requested. - # max_instances=1 for additional copies to avoid running too many tasks. - # Move other tasks here when they need per-replica scaling. - _scheduler.add_job( - process_submitted_jobs, - IntervalTrigger(seconds=4, jitter=2), - kwargs={"batch_size": 5}, - max_instances=4 if replica == 0 else 1, - ) - _scheduler.add_job( - process_running_jobs, - IntervalTrigger(seconds=4, jitter=2), - kwargs={"batch_size": 5}, - max_instances=2 if replica == 0 else 1, - ) - _scheduler.add_job( - process_terminating_jobs, - IntervalTrigger(seconds=4, jitter=2), - kwargs={"batch_size": 5}, - max_instances=2 if replica == 0 else 1, - ) - _scheduler.add_job( - process_runs, - IntervalTrigger(seconds=2, jitter=1), - kwargs={"batch_size": 5}, - max_instances=2 if replica == 0 else 1, - ) - _scheduler.add_job( - process_instances, - IntervalTrigger(seconds=4, jitter=2), - kwargs={"batch_size": 5}, - max_instances=2 if replica == 0 else 1, - ) - _scheduler.add_job( - process_compute_groups, - IntervalTrigger(seconds=15, jitter=2), - kwargs={"batch_size": 1}, - max_instances=2 if replica == 0 else 1, - ) - _scheduler.start() - return _scheduler diff --git a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py new file mode 100644 index 0000000000..355e042476 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py @@ -0,0 +1,69 @@ +import asyncio + +from dstack._internal.server.background.pipeline_tasks.base import Pipeline +from dstack._internal.server.background.pipeline_tasks.compute_groups import ComputeGroupPipeline +from dstack._internal.server.background.pipeline_tasks.placement_groups import ( + PlacementGroupPipeline, +) +from dstack._internal.settings import FeatureFlags +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class 
PipelineManager: + def __init__(self) -> None: + self._pipelines: list[Pipeline] = [] + if FeatureFlags.PIPELINE_PROCESSING_ENABLED: + self._pipelines += [ + ComputeGroupPipeline(), + PlacementGroupPipeline(), + ] + self._hinter = PipelineHinter(self._pipelines) + + def start(self): + for pipeline in self._pipelines: + pipeline.start() + + def shutdown(self): + for pipeline in self._pipelines: + pipeline.shutdown() + + async def drain(self): + results = await asyncio.gather( + *[p.drain() for p in self._pipelines], return_exceptions=True + ) + for pipeline, result in zip(self._pipelines, results): + if isinstance(result, BaseException): + logger.error( + "Unexpected exception when draining pipeline %r", + pipeline, + exc_info=(type(result), result, result.__traceback__), + ) + + @property + def hinter(self): + return self._hinter + + +class PipelineHinter: + def __init__(self, pipelines: list[Pipeline]) -> None: + self._pipelines = pipelines + self._hint_fetch_map = {p.hint_fetch_model_name: p for p in self._pipelines} + + def hint_fetch(self, model_name: str): + pipeline = self._hint_fetch_map.get(model_name) + if pipeline is None: + logger.warning("Model %s not registered for fetch hints", model_name) + return + pipeline.hint_fetch() + + +def start_pipeline_tasks() -> PipelineManager: + """ + Start tasks processed by fetch-workers pipelines based on db + in-memory queues. + Suitable for tasks that run frequently and need to lock rows for a long time. 
+ """ + pipeline_manager = PipelineManager() + pipeline_manager.start() + return pipeline_manager diff --git a/src/dstack/_internal/server/background/pipeline_tasks/base.py b/src/dstack/_internal/server/background/pipeline_tasks/base.py new file mode 100644 index 0000000000..30be480bf9 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/base.py @@ -0,0 +1,344 @@ +import asyncio +import math +import random +import uuid +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Any, ClassVar, Generic, Optional, Protocol, Sequence, TypeVar + +from sqlalchemy import and_, or_, update +from sqlalchemy.orm import Mapped + +from dstack._internal.server.db import get_session_ctx +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +@dataclass +class PipelineItem: + __tablename__: str + id: uuid.UUID + lock_expires_at: datetime + lock_token: uuid.UUID + prev_lock_expired: bool + + +class PipelineModel(Protocol): + __tablename__: str + __mapper__: ClassVar[Any] + __table__: ClassVar[Any] + id: Mapped[uuid.UUID] + lock_expires_at: Mapped[Optional[datetime]] + lock_token: Mapped[Optional[uuid.UUID]] + + +class PipelineError(Exception): + pass + + +class Pipeline(ABC): + def __init__( + self, + workers_num: int, + queue_lower_limit_factor: float, + queue_upper_limit_factor: float, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeat_trigger: timedelta, + ) -> None: + self._workers_num = workers_num + self._queue_lower_limit_factor = queue_lower_limit_factor + self._queue_upper_limit_factor = queue_upper_limit_factor + self._queue_desired_minsize = math.ceil(workers_num * queue_lower_limit_factor) + self._queue_maxsize = math.ceil(workers_num * queue_upper_limit_factor) + self._min_processing_interval = min_processing_interval + self._lock_timeout = 
lock_timeout + self._heartbeat_trigger = heartbeat_trigger + self._queue = asyncio.Queue[PipelineItem](maxsize=self._queue_maxsize) + self._tasks: list[asyncio.Task] = [] + self._running = False + self._shutdown = False + + def start(self): + """ + Starts all pipeline tasks. + """ + if self._running: + return + if self._shutdown: + raise PipelineError("Cannot start pipeline after shutdown.") + self._running = True + self._tasks.append(asyncio.create_task(self._heartbeater.start())) + for worker in self._workers: + self._tasks.append(asyncio.create_task(worker.start())) + self._tasks.append(asyncio.create_task(self._fetcher.start())) + + def shutdown(self): + """ + Stops the pipeline from processing new items and signals running tasks to cancel. + """ + if self._shutdown: + return + self._shutdown = True + self._running = False + self._fetcher.stop() + for worker in self._workers: + worker.stop() + self._heartbeater.stop() + for task in self._tasks: + if not task.done(): + task.cancel() + + async def drain(self): + """ + Waits for all pipeline tasks to finish cleanup after shutdown. + """ + if not self._shutdown: + raise PipelineError("Cannot drain running pipeline. 
 Call `shutdown()` first.") + results = await asyncio.gather(*self._tasks, return_exceptions=True) + for task, result in zip(self._tasks, results): + if isinstance(result, BaseException) and not isinstance( + result, asyncio.CancelledError + ): + logger.error( + "Unexpected exception when draining pipeline task %r", + task, + exc_info=(type(result), result, result.__traceback__), + ) + + def hint_fetch(self): + self._fetcher.hint() + + @property + @abstractmethod + def hint_fetch_model_name(self) -> str: + pass + + @property + @abstractmethod + def _heartbeater(self) -> "Heartbeater": + pass + + @property + @abstractmethod + def _fetcher(self) -> "Fetcher": + pass + + @property + @abstractmethod + def _workers(self) -> Sequence["Worker"]: + pass + + +ModelT = TypeVar("ModelT", bound=PipelineModel) + + +class Heartbeater(Generic[ModelT]): + def __init__( + self, + model_type: type[ModelT], + lock_timeout: timedelta, + heartbeat_trigger: timedelta, + heartbeat_delay: float = 1.0, + ) -> None: + self._model_type = model_type + self._lock_timeout = lock_timeout + self._hearbeat_margin = heartbeat_trigger + self._items: dict[uuid.UUID, PipelineItem] = {} + self._untrack_lock = asyncio.Lock() + self._heartbeat_delay = heartbeat_delay + self._running = False + + async def start(self): + self._running = True + while self._running: + try: + await self.heartbeat() + except Exception: + logger.exception("Unexpected exception when running heartbeat") + await asyncio.sleep(self._heartbeat_delay) + + def stop(self): + self._running = False + + async def track(self, item: PipelineItem): + self._items[item.id] = item + + async def untrack(self, item: PipelineItem): + async with self._untrack_lock: + tracked = self._items.get(item.id) + # Prevent an expired fetch iteration from unlocking an item processed by a new iteration. 
+ if tracked is not None and tracked.lock_token == item.lock_token: + del self._items[item.id] + + async def heartbeat(self): + items_to_update: list[PipelineItem] = [] + now = get_current_datetime() + items = list(self._items.values()) + failed_to_heartbeat_count = 0 + for item in items: + if item.lock_expires_at < now: + failed_to_heartbeat_count += 1 + await self.untrack(item) + elif item.lock_expires_at < now + self._hearbeat_margin: + items_to_update.append(item) + if failed_to_heartbeat_count > 0: + logger.warning( + "Failed to heartbeat %d %s items in time." + " The items are expected to be processed on another fetch iteration.", + failed_to_heartbeat_count, + self._model_type.__tablename__, + ) + if len(items_to_update) == 0: + return + logger.debug( + "Updating lock_expires_at for items: %s", [str(r.id) for r in items_to_update] + ) + async with get_session_ctx() as session: + per_item_filters = [ + and_( + self._model_type.id == item.id, self._model_type.lock_token == item.lock_token + ) + for item in items_to_update + ] + res = await session.execute( + update(self._model_type) + .where(or_(*per_item_filters)) + .values(lock_expires_at=now + self._lock_timeout) + .returning(self._model_type.id) + ) + updated_ids = set(res.scalars().all()) + failed_to_update_count = 0 + for item in items_to_update: + if item.id in updated_ids: + item.lock_expires_at = now + self._lock_timeout + else: + failed_to_update_count += 1 + await self.untrack(item) + if failed_to_update_count > 0: + logger.warning( + "Failed to update %s lock_expires_at of %d items: lock_token changed." 
+ " The items are expected to be processed and updated on another fetch iteration.", + self._model_type.__tablename__, + failed_to_update_count, + ) + + +class Fetcher(ABC): + _DEFAULT_FETCH_DELAYS = [0.5, 1, 2, 5] + + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater, + queue_check_delay: float = 1.0, + fetch_delays: Optional[list[float]] = None, + ) -> None: + self._queue = queue + self._queue_desired_minsize = queue_desired_minsize + self._min_processing_interval = min_processing_interval + self._lock_timeout = lock_timeout + self._heartbeater = heartbeater + self._queue_check_delay = queue_check_delay + if fetch_delays is None: + fetch_delays = self._DEFAULT_FETCH_DELAYS + self._fetch_delays = fetch_delays + self._running = False + self._fetch_event = asyncio.Event() + + async def start(self): + self._running = True + empty_fetch_count = 0 + while self._running: + if self._queue.qsize() >= self._queue_desired_minsize: + await asyncio.sleep(self._queue_check_delay) + continue + fetch_limit = self._queue.maxsize - self._queue.qsize() + try: + items = await self.fetch(limit=fetch_limit) + except Exception: + logger.exception("Unexpected exception when fetching new items") + items = [] + if len(items) == 0: + try: + await asyncio.wait_for( + self._fetch_event.wait(), + timeout=self._next_fetch_delay(empty_fetch_count), + ) + except TimeoutError: + pass + empty_fetch_count += 1 + self._fetch_event.clear() + continue + else: + empty_fetch_count = 0 + for item in items: + self._queue.put_nowait(item) # should never raise + await self._heartbeater.track(item) + + def stop(self): + self._running = False + + def hint(self): + self._fetch_event.set() + + @abstractmethod + async def fetch(self, limit: int) -> list[PipelineItem]: + pass + + def _next_fetch_delay(self, empty_fetch_count: int) -> float: + next_delay = 
self._fetch_delays[min(empty_fetch_count, len(self._fetch_delays) - 1)] + jitter = random.random() * 0.4 - 0.2 + return next_delay * (1 + jitter) + + +class Worker(ABC): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + heartbeater: Heartbeater, + ) -> None: + self._queue = queue + self._heartbeater = heartbeater + self._running = False + + async def start(self): + self._running = True + while self._running: + item = await self._queue.get() + logger.debug("Processing %s item %s", item.__tablename__, item.id) + try: + await self.process(item) + except Exception: + logger.exception("Unexpected exception when processing item") + finally: + await self._heartbeater.untrack(item) + logger.debug("Processed %s item %s", item.__tablename__, item.id) + + def stop(self): + self._running = False + + @abstractmethod + async def process(self, item: PipelineItem): + pass + + +UpdateMap = dict[str, Any] + + +def get_unlock_update_map() -> UpdateMap: + return { + "lock_expires_at": None, + "lock_token": None, + "lock_owner": None, + } + + +def get_processed_update_map() -> UpdateMap: + return {"last_processed_at": get_current_datetime()} diff --git a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py new file mode 100644 index 0000000000..685c5205a8 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py @@ -0,0 +1,335 @@ +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from typing import Sequence + +from sqlalchemy import or_, select, update +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.backends.base.compute import ComputeWithGroupProvisioningSupport +from dstack._internal.core.errors import BackendError +from dstack._internal.core.models.compute_groups import ComputeGroupStatus +from dstack._internal.core.models.instances import 
InstanceStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + Pipeline, + PipelineItem, + UpdateMap, + Worker, + get_processed_update_map, + get_unlock_update_map, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ComputeGroupModel, InstanceModel, ProjectModel +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services.compute_groups import compute_group_model_to_compute_group +from dstack._internal.server.services.instances import switch_instance_status +from dstack._internal.server.services.locking import get_locker +from dstack._internal.utils.common import get_current_datetime, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +TERMINATION_RETRY_TIMEOUT = timedelta(seconds=60) +TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15) + + +class ComputeGroupPipeline(Pipeline): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=15), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[ComputeGroupModel]( + model_type=ComputeGroupModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = ComputeGroupFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + 
heartbeater=self._heartbeater, + ) + self.__workers = [ + ComputeGroupWorker(queue=self._queue, heartbeater=self._heartbeater) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return ComputeGroupModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher: + return self.__fetcher + + @property + def _workers(self) -> Sequence["ComputeGroupWorker"]: + return self.__workers + + +class ComputeGroupFetcher(Fetcher): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[ComputeGroupModel], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + async def fetch(self, limit: int) -> list[PipelineItem]: + compute_group_lock, _ = get_locker(get_db().dialect_name).get_lockset( + ComputeGroupModel.__tablename__ + ) + async with compute_group_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(ComputeGroupModel) + .where( + ComputeGroupModel.status.not_in(ComputeGroupStatus.finished_statuses()), + ComputeGroupModel.last_processed_at <= now - self._min_processing_interval, + or_( + ComputeGroupModel.lock_expires_at.is_(None), + ComputeGroupModel.lock_expires_at < now, + ), + or_( + ComputeGroupModel.lock_owner.is_(None), + ComputeGroupModel.lock_owner == ComputeGroupPipeline.__name__, + ), + ) + .order_by(ComputeGroupModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True, of=ComputeGroupModel) + .options( + load_only( + ComputeGroupModel.id, + ComputeGroupModel.lock_token, + 
ComputeGroupModel.lock_expires_at, + ) + ) + ) + compute_group_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for compute_group_model in compute_group_models: + prev_lock_expired = compute_group_model.lock_expires_at is not None + compute_group_model.lock_expires_at = lock_expires_at + compute_group_model.lock_token = lock_token + compute_group_model.lock_owner = ComputeGroupPipeline.__name__ + items.append( + PipelineItem( + __tablename__=ComputeGroupModel.__tablename__, + id=compute_group_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + ) + ) + await session.commit() + return items + + +class ComputeGroupWorker(Worker): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + heartbeater: Heartbeater[ComputeGroupModel], + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + ) + + async def process(self, item: PipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(ComputeGroupModel) + .where( + ComputeGroupModel.id == item.id, + ComputeGroupModel.lock_token == item.lock_token, + ) + # Terminating instances belonging to a compute group are locked implicitly by locking the compute group. + .options( + joinedload(ComputeGroupModel.instances), + joinedload(ComputeGroupModel.project).joinedload(ProjectModel.backends), + ) + ) + compute_group_model = res.unique().scalar_one_or_none() + if compute_group_model is None: + logger.warning( + "Failed to process %s item %s: lock_token mismatch." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + terminate_result = _TerminateResult() + # TODO: Fetch only compute groups with all instances terminating. 
+ if all(i.status == InstanceStatus.TERMINATING for i in compute_group_model.instances): + terminate_result = await _terminate_compute_group(compute_group_model) + if terminate_result.compute_group_update_map: + logger.info("Terminated compute group %s", compute_group_model.id) + else: + terminate_result.compute_group_update_map = get_processed_update_map() + + terminate_result.compute_group_update_map |= get_unlock_update_map() + + async with get_session_ctx() as session: + res = await session.execute( + update(ComputeGroupModel) + .where( + ComputeGroupModel.id == compute_group_model.id, + ComputeGroupModel.lock_token == compute_group_model.lock_token, + ) + .values(**terminate_result.compute_group_update_map) + .returning(ComputeGroupModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.warning( + "Failed to update %s item %s after processing: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + if not terminate_result.instances_update_map: + return + instances_ids = [i.id for i in compute_group_model.instances] + res = await session.execute( + update(InstanceModel) + .where(InstanceModel.id.in_(instances_ids)) + .values(**terminate_result.instances_update_map) + ) + for instance_model in compute_group_model.instances: + switch_instance_status(session, instance_model, InstanceStatus.TERMINATED) + + +@dataclass +class _TerminateResult: + compute_group_update_map: UpdateMap = field(default_factory=dict) + instances_update_map: UpdateMap = field(default_factory=dict) + + +async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> _TerminateResult: + result = _TerminateResult() + if ( + compute_group_model.last_termination_retry_at is not None + and _next_termination_retry_at(compute_group_model.last_termination_retry_at) + > get_current_datetime() + ): + return result + compute_group = 
compute_group_model_to_compute_group(compute_group_model) + cgpd = compute_group.provisioning_data + backend = await backends_services.get_project_backend_by_type( + project=compute_group_model.project, + backend_type=cgpd.backend, + ) + if backend is None: + logger.error( + "Failed to terminate compute group %s. Backend %s not available." + " Please terminate it manually to avoid unexpected charges.", + compute_group.name, + cgpd.backend, + ) + return _get_terminated_result() + logger.debug("Terminating compute group %s", compute_group.name) + compute = backend.compute() + assert isinstance(compute, ComputeWithGroupProvisioningSupport) + try: + await run_async( + compute.terminate_compute_group, + compute_group, + ) + except Exception as e: + if compute_group_model.first_termination_retry_at is None: + result.compute_group_update_map["first_termination_retry_at"] = get_current_datetime() + result.compute_group_update_map["last_termination_retry_at"] = get_current_datetime() + if _next_termination_retry_at( + result.compute_group_update_map["last_termination_retry_at"] + ) < _get_termination_deadline( + result.compute_group_update_map.get( + "first_termination_retry_at", compute_group_model.first_termination_retry_at + ) + ): + logger.warning( + "Failed to terminate compute group %s. Will retry. Error: %r", + compute_group.name, + e, + exc_info=not isinstance(e, BackendError), + ) + return result + logger.error( + "Failed all attempts to terminate compute group %s." + " Please terminate it manually to avoid unexpected charges." 
+ " Error: %r", + compute_group.name, + e, + exc_info=not isinstance(e, BackendError), + ) + terminated_result = _get_terminated_result() + return _TerminateResult( + compute_group_update_map=result.compute_group_update_map + | terminated_result.compute_group_update_map, + instances_update_map=result.instances_update_map | terminated_result.instances_update_map, + ) + + +def _next_termination_retry_at(last_termination_retry_at: datetime) -> datetime: + return last_termination_retry_at + TERMINATION_RETRY_TIMEOUT + + +def _get_termination_deadline(first_termination_retry_at: datetime) -> datetime: + return first_termination_retry_at + TERMINATION_RETRY_MAX_DURATION + + +def _get_terminated_result() -> _TerminateResult: + now = get_current_datetime() + return _TerminateResult( + compute_group_update_map={ + "last_processed_at": now, + "deleted": True, + "deleted_at": now, + "status": ComputeGroupStatus.TERMINATED, + }, + instances_update_map={ + "last_processed_at": now, + "deleted": True, + "deleted_at": now, + "finished_at": now, + "status": InstanceStatus.TERMINATED, + }, + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py new file mode 100644 index 0000000000..9fac5665a5 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py @@ -0,0 +1,263 @@ +import asyncio +import uuid +from datetime import timedelta +from typing import Sequence + +from sqlalchemy import or_, select, update +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport +from dstack._internal.core.errors import PlacementGroupInUseError +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + Pipeline, + PipelineItem, + UpdateMap, + Worker, + get_processed_update_map, + get_unlock_update_map, +) +from dstack._internal.server.db 
import get_db, get_session_ctx +from dstack._internal.server.models import ( + PlacementGroupModel, + ProjectModel, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.placement import placement_group_model_to_placement_group +from dstack._internal.utils.common import get_current_datetime, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class PlacementGroupPipeline(Pipeline): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=15), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[PlacementGroupModel]( + model_type=PlacementGroupModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = PlacementGroupFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + PlacementGroupWorker(queue=self._queue, heartbeater=self._heartbeater) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return PlacementGroupModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher: + return self.__fetcher + + @property + def _workers(self) -> 
Sequence["PlacementGroupWorker"]: + return self.__workers + + +class PlacementGroupFetcher(Fetcher): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[PlacementGroupModel], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + async def fetch(self, limit: int) -> list[PipelineItem]: + placement_group_lock, _ = get_locker(get_db().dialect_name).get_lockset( + PlacementGroupModel.__tablename__ + ) + async with placement_group_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(PlacementGroupModel) + .where( + PlacementGroupModel.fleet_deleted == True, + PlacementGroupModel.deleted == False, + PlacementGroupModel.last_processed_at + <= now - self._min_processing_interval, + or_( + PlacementGroupModel.lock_expires_at.is_(None), + PlacementGroupModel.lock_expires_at < now, + ), + or_( + PlacementGroupModel.lock_owner.is_(None), + PlacementGroupModel.lock_owner == PlacementGroupPipeline.__name__, + ), + ) + .order_by(PlacementGroupModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True) + .options( + load_only( + PlacementGroupModel.id, + PlacementGroupModel.lock_token, + PlacementGroupModel.lock_expires_at, + ) + ) + ) + placement_group_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for placement_group_model in placement_group_models: + prev_lock_expired = placement_group_model.lock_expires_at is not None + placement_group_model.lock_expires_at = lock_expires_at + placement_group_model.lock_token = lock_token + 
placement_group_model.lock_owner = PlacementGroupPipeline.__name__ + items.append( + PipelineItem( + __tablename__=PlacementGroupModel.__tablename__, + id=placement_group_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + ) + ) + await session.commit() + return items + + +class PlacementGroupWorker(Worker): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + heartbeater: Heartbeater[PlacementGroupModel], + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + ) + + async def process(self, item: PipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(PlacementGroupModel) + .where( + PlacementGroupModel.id == item.id, + PlacementGroupModel.lock_token == item.lock_token, + ) + .options(joinedload(PlacementGroupModel.project).joinedload(ProjectModel.backends)) + ) + placement_group_model = res.unique().scalar_one_or_none() + if placement_group_model is None: + logger.warning( + "Failed to process %s item %s: lock_token mismatch." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + update_map = await _delete_placement_group(placement_group_model) + if update_map: + logger.info("Deleted placement group %s", placement_group_model.name) + else: + update_map = get_processed_update_map() + + update_map |= get_unlock_update_map() + + async with get_session_ctx() as session: + res = await session.execute( + update(PlacementGroupModel) + .where( + PlacementGroupModel.id == placement_group_model.id, + PlacementGroupModel.lock_token == placement_group_model.lock_token, + ) + .values(**update_map) + .returning(PlacementGroupModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.warning( + "Failed to update %s item %s after processing: lock_token changed." 
+ " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + + +async def _delete_placement_group(placement_group_model: PlacementGroupModel) -> UpdateMap: + placement_group = placement_group_model_to_placement_group(placement_group_model) + if placement_group.provisioning_data is None: + logger.error( + "Failed to delete placement group %s. provisioning_data is None.", placement_group.name + ) + return _get_deleted_update_map() + backend = await backends_services.get_project_backend_by_type( + project=placement_group_model.project, + backend_type=placement_group.provisioning_data.backend, + ) + if backend is None: + logger.error( + "Failed to delete placement group %s. Backend not available. Please delete it manually.", + placement_group.name, + ) + return _get_deleted_update_map() + compute = backend.compute() + assert isinstance(compute, ComputeWithPlacementGroupSupport) + try: + await run_async(compute.delete_placement_group, placement_group) + except PlacementGroupInUseError: + logger.info( + "Placement group %s is still in use. Skipping deletion for now.", placement_group.name + ) + return {} + except Exception: + logger.exception( + "Got exception when deleting placement group %s. 
Please delete it manually.", + placement_group.name, + ) + return _get_deleted_update_map() + + return _get_deleted_update_map() + + +def _get_deleted_update_map() -> UpdateMap: + now = get_current_datetime() + return { + "last_processed_at": now, + "deleted": True, + "deleted_at": now, + } diff --git a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py new file mode 100644 index 0000000000..c4baf96c58 --- /dev/null +++ b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py @@ -0,0 +1,159 @@ +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from apscheduler.triggers.interval import IntervalTrigger + +from dstack._internal.server import settings +from dstack._internal.server.background.scheduled_tasks.compute_groups import ( + process_compute_groups, +) +from dstack._internal.server.background.scheduled_tasks.events import delete_events +from dstack._internal.server.background.scheduled_tasks.fleets import process_fleets +from dstack._internal.server.background.scheduled_tasks.gateways import ( + process_gateways, + process_gateways_connections, +) +from dstack._internal.server.background.scheduled_tasks.idle_volumes import ( + process_idle_volumes, +) +from dstack._internal.server.background.scheduled_tasks.instances import ( + delete_instance_health_checks, + process_instances, +) +from dstack._internal.server.background.scheduled_tasks.metrics import ( + collect_metrics, + delete_metrics, +) +from dstack._internal.server.background.scheduled_tasks.placement_groups import ( + process_placement_groups, +) +from dstack._internal.server.background.scheduled_tasks.probes import process_probes +from dstack._internal.server.background.scheduled_tasks.prometheus_metrics import ( + collect_prometheus_metrics, + delete_prometheus_metrics, +) +from dstack._internal.server.background.scheduled_tasks.running_jobs import ( + process_running_jobs, +) +from 
dstack._internal.server.background.scheduled_tasks.runs import process_runs +from dstack._internal.server.background.scheduled_tasks.submitted_jobs import ( + process_submitted_jobs, +) +from dstack._internal.server.background.scheduled_tasks.terminating_jobs import ( + process_terminating_jobs, +) +from dstack._internal.server.background.scheduled_tasks.volumes import ( + process_submitted_volumes, +) +from dstack._internal.settings import FeatureFlags + +_scheduler = AsyncIOScheduler() + + +def get_scheduler() -> AsyncIOScheduler: + return _scheduler + + +def start_scheduled_tasks() -> AsyncIOScheduler: + """ + Start periodic tasks triggered by `apscheduler` at specific times/intervals. + Suitable for tasks that run infrequently and don't need to lock rows for a long time. + """ + # Background processing is implemented via in-memory locks on SQLite + # and SELECT FOR UPDATE on Postgres. Locks may be held for a long time. + # This is currently the main bottleneck for scaling dstack processing + # as processing more resources requires more DB connections. + # TODO: Make background processing efficient by committing locks to DB + # and processing outside of DB transactions. + # + # Now we just try to process as many resources as possible without exhausting DB connections. + # + # Quick tasks can process multiple resources per transaction. + # Potentially long tasks process one resource per transaction + # to avoid holding locks for all the resources if one is slow to process. + # Still, the next batch won't be processed unless all resources are processed, + # so larger batches do not increase processing rate linearly. + # + # The interval, batch_size, and max_instances determine background tasks processing rates. 
+ # By default, one server replica can handle: + # + # * 150 active jobs with 2 minutes processing latency + # * 150 active runs with 2 minutes processing latency + # * 150 active instances with 2 minutes processing latency + # + # These latency numbers do not account for provisioning time, + # so it may be slower if a backend is slow to provision. + # + # Users can set SERVER_BACKGROUND_PROCESSING_FACTOR to process more resources per replica. + # They also need to increase max db connections on the client side and db side. + # + # In-memory locking via locksets does not guarantee + # that the first waiting for the lock will acquire it. + # The jitter is needed to give all tasks a chance to acquire locks. + + _scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1)) + _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1) + _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1) + _scheduler.add_job(delete_events, IntervalTrigger(minutes=7), max_instances=1) + if settings.ENABLE_PROMETHEUS_METRICS: + _scheduler.add_job( + collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1 + ) + _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1) + _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15)) + _scheduler.add_job(process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5) + _scheduler.add_job( + process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5 + ) + _scheduler.add_job( + process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1 + ) + if not FeatureFlags.PIPELINE_PROCESSING_ENABLED: + _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5)) + _scheduler.add_job( + process_fleets, + IntervalTrigger(seconds=10, jitter=2), + max_instances=1, + ) + _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), 
max_instances=1) + for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR): + # Add multiple copies of tasks if requested. + # max_instances=1 for additional copies to avoid running too many tasks. + # Move other tasks here when they need per-replica scaling. + _scheduler.add_job( + process_submitted_jobs, + IntervalTrigger(seconds=4, jitter=2), + kwargs={"batch_size": 5}, + max_instances=4 if replica == 0 else 1, + ) + _scheduler.add_job( + process_running_jobs, + IntervalTrigger(seconds=4, jitter=2), + kwargs={"batch_size": 5}, + max_instances=2 if replica == 0 else 1, + ) + _scheduler.add_job( + process_terminating_jobs, + IntervalTrigger(seconds=4, jitter=2), + kwargs={"batch_size": 5}, + max_instances=2 if replica == 0 else 1, + ) + _scheduler.add_job( + process_runs, + IntervalTrigger(seconds=2, jitter=1), + kwargs={"batch_size": 5}, + max_instances=2 if replica == 0 else 1, + ) + _scheduler.add_job( + process_instances, + IntervalTrigger(seconds=4, jitter=2), + kwargs={"batch_size": 5}, + max_instances=2 if replica == 0 else 1, + ) + if not FeatureFlags.PIPELINE_PROCESSING_ENABLED: + _scheduler.add_job( + process_compute_groups, + IntervalTrigger(seconds=15, jitter=2), + kwargs={"batch_size": 1}, + max_instances=2 if replica == 0 else 1, + ) + _scheduler.start() + return _scheduler diff --git a/src/dstack/_internal/server/background/tasks/common.py b/src/dstack/_internal/server/background/scheduled_tasks/common.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/common.py rename to src/dstack/_internal/server/background/scheduled_tasks/common.py diff --git a/src/dstack/_internal/server/background/tasks/process_compute_groups.py b/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_compute_groups.py rename to src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py diff --git 
a/src/dstack/_internal/server/background/tasks/process_events.py b/src/dstack/_internal/server/background/scheduled_tasks/events.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_events.py rename to src/dstack/_internal/server/background/scheduled_tasks/events.py diff --git a/src/dstack/_internal/server/background/tasks/process_fleets.py b/src/dstack/_internal/server/background/scheduled_tasks/fleets.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_fleets.py rename to src/dstack/_internal/server/background/scheduled_tasks/fleets.py diff --git a/src/dstack/_internal/server/background/tasks/process_gateways.py b/src/dstack/_internal/server/background/scheduled_tasks/gateways.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_gateways.py rename to src/dstack/_internal/server/background/scheduled_tasks/gateways.py diff --git a/src/dstack/_internal/server/background/tasks/process_idle_volumes.py b/src/dstack/_internal/server/background/scheduled_tasks/idle_volumes.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_idle_volumes.py rename to src/dstack/_internal/server/background/scheduled_tasks/idle_volumes.py diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/scheduled_tasks/instances.py similarity index 99% rename from src/dstack/_internal/server/background/tasks/process_instances.py rename to src/dstack/_internal/server/background/scheduled_tasks/instances.py index da47cf16ed..196f347c4f 100644 --- a/src/dstack/_internal/server/background/tasks/process_instances.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/instances.py @@ -59,7 +59,7 @@ JobProvisioningData, ) from dstack._internal.server import settings as server_settings -from dstack._internal.server.background.tasks.common import get_provisioning_timeout +from 
dstack._internal.server.background.scheduled_tasks.common import get_provisioning_timeout from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ( FleetModel, diff --git a/src/dstack/_internal/server/background/tasks/process_metrics.py b/src/dstack/_internal/server/background/scheduled_tasks/metrics.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_metrics.py rename to src/dstack/_internal/server/background/scheduled_tasks/metrics.py diff --git a/src/dstack/_internal/server/background/tasks/process_placement_groups.py b/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_placement_groups.py rename to src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py diff --git a/src/dstack/_internal/server/background/tasks/process_probes.py b/src/dstack/_internal/server/background/scheduled_tasks/probes.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_probes.py rename to src/dstack/_internal/server/background/scheduled_tasks/probes.py diff --git a/src/dstack/_internal/server/background/tasks/process_prometheus_metrics.py b/src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_prometheus_metrics.py rename to src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py similarity index 99% rename from src/dstack/_internal/server/background/tasks/process_running_jobs.py rename to src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py index 7275106ceb..f413edf44b 100644 --- 
a/src/dstack/_internal/server/background/tasks/process_running_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py @@ -37,7 +37,7 @@ RunStatus, ) from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint -from dstack._internal.server.background.tasks.common import get_provisioning_timeout +from dstack._internal.server.background.scheduled_tasks.common import get_provisioning_timeout from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ( FleetModel, diff --git a/src/dstack/_internal/server/background/tasks/process_runs.py b/src/dstack/_internal/server/background/scheduled_tasks/runs.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_runs.py rename to src/dstack/_internal/server/background/scheduled_tasks/runs.py diff --git a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py b/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py similarity index 99% rename from src/dstack/_internal/server/background/tasks/process_submitted_jobs.py rename to src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py index a021096613..79746e9338 100644 --- a/src/dstack/_internal/server/background/tasks/process_submitted_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py @@ -57,7 +57,9 @@ from dstack._internal.core.models.volumes import Volume from dstack._internal.core.services.profiles import get_termination from dstack._internal.server import settings -from dstack._internal.server.background.tasks.process_compute_groups import ComputeGroupStatus +from dstack._internal.server.background.scheduled_tasks.compute_groups import ( + ComputeGroupStatus, +) from dstack._internal.server.db import ( get_db, get_session_ctx, diff --git a/src/dstack/_internal/server/background/tasks/process_terminating_jobs.py 
b/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_terminating_jobs.py rename to src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py diff --git a/src/dstack/_internal/server/background/tasks/process_volumes.py b/src/dstack/_internal/server/background/scheduled_tasks/volumes.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/process_volumes.py rename to src/dstack/_internal/server/background/scheduled_tasks/volumes.py diff --git a/src/dstack/_internal/server/migrations/versions/57cff3ec86ce_add_computegroupmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/57cff3ec86ce_add_computegroupmodel_pipeline_columns.py new file mode 100644 index 0000000000..e341b3b4a4 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/57cff3ec86ce_add_computegroupmodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add ComputeGroupModel pipeline columns + +Revision ID: 57cff3ec86ce +Revises: 706e0acc3a7d +Create Date: 2026-02-18 11:07:48.686185 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "57cff3ec86ce" +down_revision = "706e0acc3a7d" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("compute_groups", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("compute_groups", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/9c2a227b0154_add_placementgroupmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/9c2a227b0154_add_placementgroupmodel_pipeline_columns.py new file mode 100644 index 0000000000..56297fde36 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/9c2a227b0154_add_placementgroupmodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add PlacementGroupModel pipeline columns + +Revision ID: 9c2a227b0154 +Revises: 57cff3ec86ce +Create Date: 2026-02-18 11:08:57.860277 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "9c2a227b0154" +down_revision = "57cff3ec86ce" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("placement_groups", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("placement_groups", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py b/src/dstack/_internal/server/migrations/versions/a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py new file mode 100644 index 0000000000..ad35a23d06 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py @@ -0,0 +1,57 @@ +"""Add pipeline indexes for compute and placement groups + +Revision ID: a8ed24fd7f90 +Revises: 9c2a227b0154 +Create Date: 2026-02-18 11:22:25.972000 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "a8ed24fd7f90" +down_revision = "9c2a227b0154" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.get_context().autocommit_block(): + op.create_index( + "ix_compute_groups_pipeline_fetch_q", + "compute_groups", + [sa.literal_column("last_processed_at ASC")], + unique=False, + postgresql_where=sa.text("(status NOT IN ('TERMINATED'))"), + sqlite_where=sa.text("(status NOT IN ('TERMINATED'))"), + postgresql_concurrently=True, + ) + op.create_index( + "ix_placement_groups_pipeline_fetch_q", + "placement_groups", + [sa.literal_column("last_processed_at ASC")], + unique=False, + postgresql_where=sa.text("deleted IS FALSE"), + sqlite_where=sa.text("deleted = 0"), + postgresql_concurrently=True, + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_placement_groups_pipeline_fetch_q", + "placement_groups", + postgresql_concurrently=True, + ) + op.drop_index( + "ix_compute_groups_pipeline_fetch_q", + "compute_groups", + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 7e9db282d1..a837137a10 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -196,6 +196,12 @@ class BaseModel(DeclarativeBase): metadata = MetaData(naming_convention=constraint_naming_convention) +class PipelineModelMixin: + lock_expires_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + lock_token: Mapped[Optional[uuid.UUID]] = mapped_column(UUIDType(binary=False)) + lock_owner: Mapped[Optional[str]] = mapped_column(String(100)) + + class UserModel(BaseModel): __tablename__ = "users" @@ -768,7 +774,7 @@ class VolumeAttachmentModel(BaseModel): attachment_data: Mapped[Optional[str]] = mapped_column(Text) -class PlacementGroupModel(BaseModel): +class PlacementGroupModel(PipelineModelMixin, BaseModel): __tablename__ = "placement_groups" id: Mapped[uuid.UUID] = mapped_column( @@ 
-794,8 +800,17 @@ class PlacementGroupModel(BaseModel): configuration: Mapped[str] = mapped_column(Text) provisioning_data: Mapped[Optional[str]] = mapped_column(Text) + __table_args__ = ( + Index( + "ix_placement_groups_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) + -class ComputeGroupModel(BaseModel): +class ComputeGroupModel(PipelineModelMixin, BaseModel): __tablename__ = "compute_groups" id: Mapped[uuid.UUID] = mapped_column( @@ -823,6 +838,15 @@ class ComputeGroupModel(BaseModel): instances: Mapped[List["InstanceModel"]] = relationship(back_populates="compute_group") + __table_args__ = ( + Index( + "ix_compute_groups_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=status.not_in(ComputeGroupStatus.finished_statuses()), + sqlite_where=status.not_in(ComputeGroupStatus.finished_statuses()), + ), + ) + class JobMetricsPoint(BaseModel): __tablename__ = "job_metrics_points" diff --git a/src/dstack/_internal/server/services/jobs/configurators/base.py b/src/dstack/_internal/server/services/jobs/configurators/base.py index a9496ad348..3310cda996 100644 --- a/src/dstack/_internal/server/services/jobs/configurators/base.py +++ b/src/dstack/_internal/server/services/jobs/configurators/base.py @@ -77,7 +77,7 @@ def get_default_python_verison() -> str: def get_default_image(nvcc: bool = False) -> str: """ Note: May be overridden by dstack (e.g., EFA-enabled version for AWS EFA-capable instances). - See `dstack._internal.server.background.tasks.process_running_jobs._patch_base_image_for_aws_efa` for details. + See `dstack._internal.server.background.scheduled_tasks.running_jobs._patch_base_image_for_aws_efa` for details. Args: nvcc: If True, returns 'devel' variant, otherwise 'base'. 
diff --git a/src/dstack/_internal/server/services/pipelines.py b/src/dstack/_internal/server/services/pipelines.py new file mode 100644 index 0000000000..19f4df902d --- /dev/null +++ b/src/dstack/_internal/server/services/pipelines.py @@ -0,0 +1,12 @@ +from typing import Protocol + +from fastapi import Request + + +class PipelineHinterProtocol(Protocol): + def hint_fetch(self, model_name: str) -> None: + pass + + +def get_pipeline_hinter(request: Request) -> PipelineHinterProtocol: + return request.app.state.pipeline_manager.hinter diff --git a/src/dstack/_internal/settings.py b/src/dstack/_internal/settings.py index 6089e37c07..d94bb56547 100644 --- a/src/dstack/_internal/settings.py +++ b/src/dstack/_internal/settings.py @@ -47,3 +47,6 @@ class FeatureFlags: # DSTACK_FF_AUTOCREATED_FLEETS_ENABLED enables legacy autocreated fleets: # If there are no fleet suitable for the run, a new fleet is created automatically instead of an error. AUTOCREATED_FLEETS_ENABLED = os.getenv("DSTACK_FF_AUTOCREATED_FLEETS_ENABLED") is not None + # DSTACK_FF_PIPELINE_PROCESSING_ENABLED enables new pipeline-based processing tasks (background/pipeline_tasks/) + # instead of scheduler-based processing tasks (background/scheduled_tasks/) for tasks that implement pipelines. 
+ PIPELINE_PROCESSING_ENABLED = os.getenv("DSTACK_FF_PIPELINE_PROCESSING_ENABLED") is not None diff --git a/src/dstack/_internal/server/background/tasks/__init__.py b/src/tests/_internal/server/background/pipeline_tasks/__init__.py similarity index 100% rename from src/dstack/_internal/server/background/tasks/__init__.py rename to src/tests/_internal/server/background/pipeline_tasks/__init__.py diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_base.py b/src/tests/_internal/server/background/pipeline_tasks/test_base.py new file mode 100644 index 0000000000..7e84d9f80d --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_base.py @@ -0,0 +1,183 @@ +import uuid +from datetime import datetime, timedelta, timezone +from unittest.mock import patch + +import pytest +from sqlalchemy import update +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.background.pipeline_tasks.base import Heartbeater, PipelineItem +from dstack._internal.server.models import PlacementGroupModel +from dstack._internal.server.testing.common import ( + create_fleet, + create_placement_group, + create_project, +) + + +@pytest.fixture +def now() -> datetime: + return datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + + +@pytest.fixture +def heartbeater() -> Heartbeater[PlacementGroupModel]: + return Heartbeater( + model_type=PlacementGroupModel, + lock_timeout=timedelta(seconds=30), + heartbeat_trigger=timedelta(seconds=5), + ) + + +async def _create_locked_placement_group( + session: AsyncSession, + now: datetime, + lock_expires_in: timedelta, +) -> PlacementGroupModel: + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + placement_group = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test-pg", + ) + placement_group.lock_token = uuid.uuid4() + placement_group.lock_expires_at = now + lock_expires_in + await session.commit() + 
return placement_group + + +def _placement_group_to_pipeline_item(placement_group: PlacementGroupModel) -> PipelineItem: + assert placement_group.lock_token is not None + assert placement_group.lock_expires_at is not None + return PipelineItem( + __tablename__=PlacementGroupModel.__tablename__, + id=placement_group.id, + lock_token=placement_group.lock_token, + lock_expires_at=placement_group.lock_expires_at, + prev_lock_expired=False, + ) + + +class TestHeartbeater: + @pytest.mark.asyncio + async def test_untrack_preserves_item_when_lock_token_mismatches( + self, heartbeater: Heartbeater[PlacementGroupModel], now: datetime + ): + item = PipelineItem( + __tablename__=PlacementGroupModel.__tablename__, + id=uuid.uuid4(), + lock_token=uuid.uuid4(), + lock_expires_at=now + timedelta(seconds=10), + prev_lock_expired=True, + ) + await heartbeater.track(item) + + stale_item = PipelineItem( + __tablename__=PlacementGroupModel.__tablename__, + id=item.id, + lock_token=uuid.uuid4(), + lock_expires_at=item.lock_expires_at, + prev_lock_expired=False, + ) + await heartbeater.untrack(stale_item) + + assert item.id in heartbeater._items + await heartbeater.untrack(item) + assert item.id not in heartbeater._items + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_heartbeat_extends_locks_close_to_expiration( + self, + test_db, + session: AsyncSession, + heartbeater: Heartbeater[PlacementGroupModel], + now: datetime, + ): + placement_group = await _create_locked_placement_group( + session=session, + now=now, + lock_expires_in=timedelta(seconds=2), + ) + await heartbeater.track(_placement_group_to_pipeline_item(placement_group)) + + with patch( + "dstack._internal.server.background.pipeline_tasks.base.get_current_datetime", + return_value=now, + ): + await heartbeater.heartbeat() + + expected_lock_expires_at = now + timedelta(seconds=30) + tracked_item = heartbeater._items[placement_group.id] + assert 
tracked_item.lock_expires_at == expected_lock_expires_at + + await session.refresh(placement_group) + assert placement_group.lock_expires_at == expected_lock_expires_at + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_heartbeat_untracks_expired_items_without_db_update( + self, + test_db, + session: AsyncSession, + heartbeater: Heartbeater[PlacementGroupModel], + now: datetime, + ): + original_lock_expires_at = now - timedelta(seconds=1) + placement_group = await _create_locked_placement_group( + session=session, + now=now, + lock_expires_in=timedelta(seconds=-1), + ) + await heartbeater.track(_placement_group_to_pipeline_item(placement_group)) + + with patch( + "dstack._internal.server.background.pipeline_tasks.base.get_current_datetime", + return_value=now, + ): + await heartbeater.heartbeat() + + assert placement_group.id not in heartbeater._items + + await session.refresh(placement_group) + assert placement_group.lock_expires_at == original_lock_expires_at + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_heartbeat_untracks_item_when_lock_token_changed_in_db( + self, + test_db, + session: AsyncSession, + heartbeater: Heartbeater[PlacementGroupModel], + now: datetime, + ): + original_lock_expires_at = now + timedelta(seconds=2) + placement_group = await _create_locked_placement_group( + session=session, + now=now, + lock_expires_in=timedelta(seconds=2), + ) + await heartbeater.track(_placement_group_to_pipeline_item(placement_group)) + + new_lock_token = uuid.uuid4() + await session.execute( + update(PlacementGroupModel) + .where(PlacementGroupModel.id == placement_group.id) + .values(lock_token=new_lock_token) + .execution_options(synchronize_session=False) + ) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.base.get_current_datetime", + return_value=now, + ): + await 
heartbeater.heartbeat() + + assert placement_group.id not in heartbeater._items + + await session.refresh(placement_group) + assert placement_group.lock_token == new_lock_token + assert placement_group.lock_expires_at == original_lock_expires_at diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py b/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py new file mode 100644 index 0000000000..6d24669f7c --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py @@ -0,0 +1,113 @@ +import uuid +from datetime import datetime, timezone +from unittest.mock import Mock, patch + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import BackendError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.compute_groups import ComputeGroupStatus +from dstack._internal.server.background.pipeline_tasks.base import PipelineItem +from dstack._internal.server.background.pipeline_tasks.compute_groups import ComputeGroupWorker +from dstack._internal.server.models import ComputeGroupModel +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_compute_group, + create_fleet, + create_project, +) + + +@pytest.fixture +def worker() -> ComputeGroupWorker: + return ComputeGroupWorker(queue=Mock(), heartbeater=Mock()) + + +def _compute_group_to_pipeline_item(compute_group: ComputeGroupModel) -> PipelineItem: + assert compute_group.lock_token is not None + assert compute_group.lock_expires_at is not None + return PipelineItem( + __tablename__=compute_group.__tablename__, + id=compute_group.id, + lock_token=compute_group.lock_token, + lock_expires_at=compute_group.lock_expires_at, + prev_lock_expired=False, + ) + + +class TestComputeGroupWorker: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def 
test_terminates_compute_group( + self, test_db, session: AsyncSession, worker: ComputeGroupWorker + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + compute_group = await create_compute_group( + session=session, + project=project, + fleet=fleet, + ) + compute_group.lock_token = uuid.uuid4() + compute_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value = compute_mock + m.return_value = backend_mock + backend_mock.TYPE = BackendType.RUNPOD + await worker.process(_compute_group_to_pipeline_item(compute_group)) + compute_mock.terminate_compute_group.assert_called_once() + await session.refresh(compute_group) + assert compute_group.status == ComputeGroupStatus.TERMINATED + assert compute_group.deleted + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_retries_compute_group_termination( + self, test_db, session: AsyncSession, worker: ComputeGroupWorker + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + compute_group = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=datetime(2023, 1, 2, 3, 0, tzinfo=timezone.utc), + ) + compute_group.lock_token = uuid.uuid4() + compute_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value = compute_mock + m.return_value = backend_mock + backend_mock.TYPE = BackendType.RUNPOD + compute_mock.terminate_compute_group.side_effect = 
BackendError() + await worker.process(_compute_group_to_pipeline_item(compute_group)) + compute_mock.terminate_compute_group.assert_called_once() + await session.refresh(compute_group) + assert compute_group.status != ComputeGroupStatus.TERMINATED + assert compute_group.first_termination_retry_at is not None + assert compute_group.last_termination_retry_at is not None + # Simulate termination deadline exceeded + compute_group.first_termination_retry_at = datetime(2023, 1, 2, 3, 0, tzinfo=timezone.utc) + compute_group.last_termination_retry_at = datetime(2023, 1, 2, 4, 0, tzinfo=timezone.utc) + compute_group.last_processed_at = datetime(2023, 1, 2, 4, 0, tzinfo=timezone.utc) + compute_group.lock_token = uuid.uuid4() + compute_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + compute_mock = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value = compute_mock + m.return_value = backend_mock + backend_mock.TYPE = BackendType.RUNPOD + compute_mock.terminate_compute_group.side_effect = BackendError() + await worker.process(_compute_group_to_pipeline_item(compute_group)) + compute_mock.terminate_compute_group.assert_called_once() + await session.refresh(compute_group) + assert compute_group.status == ComputeGroupStatus.TERMINATED diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py new file mode 100644 index 0000000000..87cab83e12 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py @@ -0,0 +1,63 @@ +import uuid +from datetime import datetime, timezone +from unittest.mock import Mock, patch + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.background.pipeline_tasks.base import PipelineItem 
+from dstack._internal.server.background.pipeline_tasks.placement_groups import PlacementGroupWorker +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_fleet, + create_placement_group, + create_project, +) + + +@pytest.fixture +def worker() -> PlacementGroupWorker: + return PlacementGroupWorker(queue=Mock(), heartbeater=Mock()) + + +def _placement_group_to_pipeline_item(placement_group) -> PipelineItem: + assert placement_group.lock_token is not None + assert placement_group.lock_expires_at is not None + return PipelineItem( + __tablename__=placement_group.__tablename__, + id=placement_group.id, + lock_token=placement_group.lock_token, + lock_expires_at=placement_group.lock_expires_at, + prev_lock_expired=False, + ) + + +class TestPlacementGroupWorker: + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_placement_group( + self, test_db, session: AsyncSession, worker: PlacementGroupWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + placement_group = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test1-pg", + fleet_deleted=True, + ) + placement_group.lock_token = uuid.uuid4() + placement_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + await worker.process(_placement_group_to_pipeline_item(placement_group)) + aws_mock.compute.return_value.delete_placement_group.assert_called_once() + await session.refresh(placement_group) + assert placement_group.deleted diff --git a/src/tests/_internal/server/background/tasks/__init__.py b/src/tests/_internal/server/background/scheduled_tasks/__init__.py similarity 
index 100% rename from src/tests/_internal/server/background/tasks/__init__.py rename to src/tests/_internal/server/background/scheduled_tasks/__init__.py diff --git a/src/tests/_internal/server/background/tasks/test_process_compute_groups.py b/src/tests/_internal/server/background/scheduled_tasks/test_compute_groups.py similarity index 96% rename from src/tests/_internal/server/background/tasks/test_process_compute_groups.py rename to src/tests/_internal/server/background/scheduled_tasks/test_compute_groups.py index 11ce734606..b2b1920199 100644 --- a/src/tests/_internal/server/background/tasks/test_process_compute_groups.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_compute_groups.py @@ -6,8 +6,8 @@ from dstack._internal.core.errors import BackendError from dstack._internal.core.models.backends.base import BackendType -from dstack._internal.server.background.tasks.process_compute_groups import ( - ComputeGroupStatus, +from dstack._internal.core.models.compute_groups import ComputeGroupStatus +from dstack._internal.server.background.scheduled_tasks.compute_groups import ( process_compute_groups, ) from dstack._internal.server.testing.common import ( diff --git a/src/tests/_internal/server/background/tasks/test_process_events.py b/src/tests/_internal/server/background/scheduled_tasks/test_events.py similarity index 94% rename from src/tests/_internal/server/background/tasks/test_process_events.py rename to src/tests/_internal/server/background/scheduled_tasks/test_events.py index 21043e0bae..91eb066f58 100644 --- a/src/tests/_internal/server/background/tasks/test_process_events.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_events.py @@ -6,7 +6,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.server import settings -from dstack._internal.server.background.tasks.process_events import delete_events +from dstack._internal.server.background.scheduled_tasks.events import delete_events from 
dstack._internal.server.services import events from dstack._internal.server.testing.common import create_user, list_events diff --git a/src/tests/_internal/server/background/tasks/test_process_fleets.py b/src/tests/_internal/server/background/scheduled_tasks/test_fleets.py similarity index 98% rename from src/tests/_internal/server/background/tasks/test_process_fleets.py rename to src/tests/_internal/server/background/scheduled_tasks/test_fleets.py index ae7155c3ca..2ef1b27ab9 100644 --- a/src/tests/_internal/server/background/tasks/test_process_fleets.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_fleets.py @@ -6,7 +6,7 @@ from dstack._internal.core.models.instances import InstanceStatus from dstack._internal.core.models.runs import RunStatus from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.background.tasks.process_fleets import process_fleets +from dstack._internal.server.background.scheduled_tasks.fleets import process_fleets from dstack._internal.server.models import InstanceModel from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( diff --git a/src/tests/_internal/server/background/tasks/test_process_gateways.py b/src/tests/_internal/server/background/scheduled_tasks/test_gateways.py similarity index 98% rename from src/tests/_internal/server/background/tasks/test_process_gateways.py rename to src/tests/_internal/server/background/scheduled_tasks/test_gateways.py index b280b8948d..5f19d2cfcd 100644 --- a/src/tests/_internal/server/background/tasks/test_process_gateways.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_gateways.py @@ -5,7 +5,7 @@ from dstack._internal.core.errors import BackendError from dstack._internal.core.models.gateways import GatewayProvisioningData, GatewayStatus -from dstack._internal.server.background.tasks.process_gateways import process_gateways +from 
dstack._internal.server.background.scheduled_tasks.gateways import process_gateways from dstack._internal.server.testing.common import ( AsyncContextManager, ComputeMockSpec, diff --git a/src/tests/_internal/server/background/tasks/test_process_idle_volumes.py b/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py similarity index 98% rename from src/tests/_internal/server/background/tasks/test_process_idle_volumes.py rename to src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py index 9d73afbb78..6a7acf0c43 100644 --- a/src/tests/_internal/server/background/tasks/test_process_idle_volumes.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py @@ -6,7 +6,7 @@ from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.volumes import VolumeStatus -from dstack._internal.server.background.tasks.process_idle_volumes import ( +from dstack._internal.server.background.scheduled_tasks.idle_volumes import ( _get_idle_time, _should_delete_volume, process_idle_volumes, diff --git a/src/tests/_internal/server/background/tasks/test_process_instances.py b/src/tests/_internal/server/background/scheduled_tasks/test_instances.py similarity index 96% rename from src/tests/_internal/server/background/tasks/test_process_instances.py rename to src/tests/_internal/server/background/scheduled_tasks/test_instances.py index 8d94ee059b..1b9789953e 100644 --- a/src/tests/_internal/server/background/tasks/test_process_instances.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_instances.py @@ -39,7 +39,7 @@ JobProvisioningData, JobStatus, ) -from dstack._internal.server.background.tasks.process_instances import ( +from dstack._internal.server.background.scheduled_tasks.instances import ( delete_instance_health_checks, process_instances, ) @@ -101,7 +101,7 @@ async def test_check_shim_transitions_provisioning_on_ready( await session.commit() with patch( - 
"dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=True) await process_instances() @@ -130,7 +130,7 @@ async def test_check_shim_transitions_provisioning_on_terminating( health_reason = "Shim problem" with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=False, message=health_reason) await process_instances() @@ -177,7 +177,7 @@ async def test_check_shim_transitions_provisioning_on_busy( await session.commit() with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=True) await process_instances() @@ -202,7 +202,7 @@ async def test_check_shim_start_termination_deadline(self, test_db, session: Asy ) health_status = "SSH connection fail" with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=False, message=health_status) await process_instances() @@ -232,7 +232,7 @@ async def test_check_shim_does_not_start_termination_deadline_with_ssh_instance( ) health_status = "SSH connection fail" with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=False, message=health_status) await process_instances() @@ -257,7 
+257,7 @@ async def test_check_shim_stop_termination_deadline(self, test_db, session: Asyn await session.commit() with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=True) await process_instances() @@ -283,7 +283,7 @@ async def test_check_shim_terminate_instance_by_deadline(self, test_db, session: health_status = "Not ok" with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=False, message=health_status) await process_instances() @@ -347,7 +347,7 @@ async def test_check_shim_process_ureachable_state( await session.commit() with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=True) await process_instances() @@ -378,7 +378,7 @@ async def test_check_shim_switch_to_unreachable_state( ) with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck(reachable=False) await process_instances() @@ -412,7 +412,7 @@ async def test_check_shim_check_instance_health(self, test_db, session: AsyncSes ) with patch( - "dstack._internal.server.background.tasks.process_instances._check_instance_inner" + "dstack._internal.server.background.scheduled_tasks.instances._check_instance_inner" ) as healthcheck: healthcheck.return_value = InstanceCheck( reachable=True, health_response=health_response @@ -440,7 +440,7 @@ class 
TestRemoveDanglingTasks: @pytest.fixture def disable_maybe_install_components(self, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr( - "dstack._internal.server.background.tasks.process_instances._maybe_install_components", + "dstack._internal.server.background.scheduled_tasks.instances._maybe_install_components", Mock(return_value=None), ) @@ -607,7 +607,7 @@ def mock_terminate_in_backend(error: Optional[Exception] = None): if error is not None: terminate_instance.side_effect = error with patch( - "dstack._internal.server.background.tasks.process_instances.backends_services.get_project_backend_by_type" + "dstack._internal.server.background.scheduled_tasks.instances.backends_services.get_project_backend_by_type" ) as get_backend: get_backend.return_value = backend yield terminate_instance @@ -1153,7 +1153,7 @@ def host_info(self) -> dict: def deploy_instance_mock(self, monkeypatch: pytest.MonkeyPatch, host_info: dict): mock = Mock(return_value=(InstanceCheck(reachable=True), host_info, GoArchType.AMD64)) monkeypatch.setattr( - "dstack._internal.server.background.tasks.process_instances._deploy_instance", mock + "dstack._internal.server.background.scheduled_tasks.instances._deploy_instance", mock ) return mock @@ -1262,7 +1262,7 @@ def component_list(self) -> ComponentList: def debug_task_log(self, caplog: pytest.LogCaptureFixture) -> pytest.LogCaptureFixture: caplog.set_level( level=logging.DEBUG, - logger="dstack._internal.server.background.tasks.process_instances", + logger="dstack._internal.server.background.scheduled_tasks.instances", ) return caplog @@ -1308,7 +1308,7 @@ def component_list(self) -> ComponentList: def get_dstack_runner_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: mock = Mock(return_value=self.EXPECTED_VERSION) monkeypatch.setattr( - "dstack._internal.server.background.tasks.process_instances.get_dstack_runner_version", + "dstack._internal.server.background.scheduled_tasks.instances.get_dstack_runner_version", mock, 
) return mock @@ -1317,7 +1317,7 @@ def get_dstack_runner_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Moc def get_dstack_runner_download_url_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: mock = Mock(return_value="https://example.com/runner") monkeypatch.setattr( - "dstack._internal.server.background.tasks.process_instances.get_dstack_runner_download_url", + "dstack._internal.server.background.scheduled_tasks.instances.get_dstack_runner_download_url", mock, ) return mock @@ -1424,7 +1424,7 @@ def component_list(self) -> ComponentList: def get_dstack_shim_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: mock = Mock(return_value=self.EXPECTED_VERSION) monkeypatch.setattr( - "dstack._internal.server.background.tasks.process_instances.get_dstack_shim_version", + "dstack._internal.server.background.scheduled_tasks.instances.get_dstack_shim_version", mock, ) return mock @@ -1433,7 +1433,7 @@ def get_dstack_shim_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: def get_dstack_shim_download_url_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: mock = Mock(return_value="https://example.com/shim") monkeypatch.setattr( - "dstack._internal.server.background.tasks.process_instances.get_dstack_shim_download_url", + "dstack._internal.server.background.scheduled_tasks.instances.get_dstack_shim_download_url", mock, ) return mock @@ -1547,7 +1547,7 @@ def component_list(self) -> ComponentList: def maybe_install_runner_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: mock = Mock(return_value=False) monkeypatch.setattr( - "dstack._internal.server.background.tasks.process_instances._maybe_install_runner", + "dstack._internal.server.background.scheduled_tasks.instances._maybe_install_runner", mock, ) return mock @@ -1556,7 +1556,7 @@ def maybe_install_runner_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: def maybe_install_shim_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: mock = Mock(return_value=False) monkeypatch.setattr( - 
"dstack._internal.server.background.tasks.process_instances._maybe_install_shim", + "dstack._internal.server.background.scheduled_tasks.instances._maybe_install_shim", mock, ) return mock diff --git a/src/tests/_internal/server/background/tasks/test_process_metrics.py b/src/tests/_internal/server/background/scheduled_tasks/test_metrics.py similarity index 98% rename from src/tests/_internal/server/background/tasks/test_process_metrics.py rename to src/tests/_internal/server/background/scheduled_tasks/test_metrics.py index 0be650a223..df52dd88e2 100644 --- a/src/tests/_internal/server/background/tasks/test_process_metrics.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_metrics.py @@ -10,7 +10,7 @@ from dstack._internal.core.models.runs import JobStatus from dstack._internal.core.models.users import GlobalRole, ProjectRole from dstack._internal.server import settings -from dstack._internal.server.background.tasks.process_metrics import ( +from dstack._internal.server.background.scheduled_tasks.metrics import ( collect_metrics, delete_metrics, ) diff --git a/src/tests/_internal/server/background/tasks/test_process_placement_groups.py b/src/tests/_internal/server/background/scheduled_tasks/test_placement_groups.py similarity index 94% rename from src/tests/_internal/server/background/tasks/test_process_placement_groups.py rename to src/tests/_internal/server/background/scheduled_tasks/test_placement_groups.py index a45051a48e..14b9d2189d 100644 --- a/src/tests/_internal/server/background/tasks/test_process_placement_groups.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_placement_groups.py @@ -3,7 +3,7 @@ import pytest from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.server.background.tasks.process_placement_groups import ( +from dstack._internal.server.background.scheduled_tasks.placement_groups import ( process_placement_groups, ) from dstack._internal.server.testing.common import ( diff --git 
a/src/tests/_internal/server/background/tasks/test_process_probes.py b/src/tests/_internal/server/background/scheduled_tasks/test_probes.py similarity index 96% rename from src/tests/_internal/server/background/tasks/test_process_probes.py rename to src/tests/_internal/server/background/scheduled_tasks/test_probes.py index 928709dd7f..bfd569ab1b 100644 --- a/src/tests/_internal/server/background/tasks/test_process_probes.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_probes.py @@ -8,7 +8,7 @@ from dstack._internal.core.models.configurations import ProbeConfig, ServiceConfiguration from dstack._internal.core.models.instances import InstanceStatus from dstack._internal.core.models.runs import JobStatus -from dstack._internal.server.background.tasks.process_probes import ( +from dstack._internal.server.background.scheduled_tasks.probes import ( PROCESSING_OVERHEAD_TIMEOUT, SSH_CONNECT_TIMEOUT, process_probes, @@ -140,7 +140,7 @@ async def test_schedules_probe_execution(self, test_db, session: AsyncSession) - processing_time = datetime(2025, 1, 1, 0, 0, 1, tzinfo=timezone.utc) with freeze_time(processing_time): with patch( - "dstack._internal.server.background.tasks.process_probes.PROBES_SCHEDULER" + "dstack._internal.server.background.scheduled_tasks.probes.PROBES_SCHEDULER" ) as scheduler_mock: await process_probes() assert scheduler_mock.add_job.call_count == 2 @@ -210,7 +210,7 @@ async def test_deactivates_probe_when_until_ready_and_ready_after_reached( probe_regular = await create_probe(session, job, probe_num=1, success_streak=3) with patch( - "dstack._internal.server.background.tasks.process_probes.PROBES_SCHEDULER" + "dstack._internal.server.background.scheduled_tasks.probes.PROBES_SCHEDULER" ) as scheduler_mock: await process_probes() diff --git a/src/tests/_internal/server/background/tasks/test_process_prometheus_metrics.py b/src/tests/_internal/server/background/scheduled_tasks/test_prometheus_metrics.py similarity index 98% rename from 
src/tests/_internal/server/background/tasks/test_process_prometheus_metrics.py rename to src/tests/_internal/server/background/scheduled_tasks/test_prometheus_metrics.py index 7c59b6dd1f..80961d5c10 100644 --- a/src/tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_prometheus_metrics.py @@ -11,7 +11,7 @@ from dstack._internal.core.models.instances import InstanceStatus from dstack._internal.core.models.runs import JobStatus from dstack._internal.core.models.users import GlobalRole, ProjectRole -from dstack._internal.server.background.tasks.process_prometheus_metrics import ( +from dstack._internal.server.background.scheduled_tasks.prometheus_metrics import ( collect_prometheus_metrics, delete_prometheus_metrics, ) diff --git a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py b/src/tests/_internal/server/background/scheduled_tasks/test_running_jobs.py similarity index 99% rename from src/tests/_internal/server/background/tasks/test_process_running_jobs.py rename to src/tests/_internal/server/background/scheduled_tasks/test_running_jobs.py index 12edeec208..0d748f4e91 100644 --- a/src/tests/_internal/server/background/tasks/test_process_running_jobs.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_running_jobs.py @@ -37,7 +37,7 @@ VolumeStatus, ) from dstack._internal.server import settings as server_settings -from dstack._internal.server.background.tasks.process_running_jobs import ( +from dstack._internal.server.background.scheduled_tasks.running_jobs import ( _patch_base_image_for_aws_efa, process_running_jobs, ) diff --git a/src/tests/_internal/server/background/tasks/test_process_runs.py b/src/tests/_internal/server/background/scheduled_tasks/test_runs.py similarity index 98% rename from src/tests/_internal/server/background/tasks/test_process_runs.py rename to src/tests/_internal/server/background/scheduled_tasks/test_runs.py 
index b9420d8e9a..ffb63de358 100644 --- a/src/tests/_internal/server/background/tasks/test_process_runs.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_runs.py @@ -8,7 +8,7 @@ from pydantic import parse_obj_as from sqlalchemy.ext.asyncio import AsyncSession -import dstack._internal.server.background.tasks.process_runs as process_runs +import dstack._internal.server.background.scheduled_tasks.runs as process_runs from dstack._internal.core.models.configurations import ( ProbeConfig, ServiceConfiguration, @@ -100,7 +100,7 @@ async def test_submitted_to_provisioning(self, test_db, session: AsyncSession): expected_duration = (current_time - run.submitted_at).total_seconds() with patch( - "dstack._internal.server.background.tasks.process_runs.run_metrics" + "dstack._internal.server.background.scheduled_tasks.runs.run_metrics" ) as mock_run_metrics: await process_runs.process_runs() @@ -131,7 +131,7 @@ async def test_keep_provisioning(self, test_db, session: AsyncSession): await create_job(session=session, run=run, status=JobStatus.PULLING) with patch( - "dstack._internal.server.background.tasks.process_runs.run_metrics" + "dstack._internal.server.background.scheduled_tasks.runs.run_metrics" ) as mock_run_metrics: await process_runs.process_runs() @@ -198,7 +198,7 @@ async def test_retry_running_to_pending(self, test_db, session: AsyncSession): with ( patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock, patch( - "dstack._internal.server.background.tasks.process_runs.run_metrics" + "dstack._internal.server.background.scheduled_tasks.runs.run_metrics" ) as mock_run_metrics, ): datetime_mock.return_value = run.submitted_at + datetime.timedelta(minutes=3) @@ -297,7 +297,7 @@ async def test_submitted_to_provisioning_if_any(self, test_db, session: AsyncSes expected_duration = (current_time - run.submitted_at).total_seconds() with patch( - "dstack._internal.server.background.tasks.process_runs.run_metrics" + 
"dstack._internal.server.background.scheduled_tasks.runs.run_metrics" ) as mock_run_metrics: await process_runs.process_runs() @@ -351,7 +351,7 @@ async def test_all_no_capacity_to_pending(self, test_db, session: AsyncSession): with ( patch("dstack._internal.utils.common.get_current_datetime") as datetime_mock, patch( - "dstack._internal.server.background.tasks.process_runs.run_metrics" + "dstack._internal.server.background.scheduled_tasks.runs.run_metrics" ) as mock_run_metrics, ): datetime_mock.return_value = run.submitted_at + datetime.timedelta(minutes=3) diff --git a/src/tests/_internal/server/background/tasks/test_process_submitted_jobs.py b/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py similarity index 99% rename from src/tests/_internal/server/background/tasks/test_process_submitted_jobs.py rename to src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py index 8a3a4b1d57..b06eb50ec2 100644 --- a/src/tests/_internal/server/background/tasks/test_process_submitted_jobs.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py @@ -27,7 +27,7 @@ VolumeMountPoint, VolumeStatus, ) -from dstack._internal.server.background.tasks.process_submitted_jobs import ( +from dstack._internal.server.background.scheduled_tasks.submitted_jobs import ( _prepare_job_runtime_data, process_submitted_jobs, ) diff --git a/src/tests/_internal/server/background/tasks/test_process_submitted_volumes.py b/src/tests/_internal/server/background/scheduled_tasks/test_submitted_volumes.py similarity index 96% rename from src/tests/_internal/server/background/tasks/test_process_submitted_volumes.py rename to src/tests/_internal/server/background/scheduled_tasks/test_submitted_volumes.py index dfeef1e42e..8c9a6bf3cf 100644 --- a/src/tests/_internal/server/background/tasks/test_process_submitted_volumes.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_submitted_volumes.py @@ -5,7 +5,7 @@ from 
dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.volumes import VolumeProvisioningData, VolumeStatus -from dstack._internal.server.background.tasks.process_volumes import process_submitted_volumes +from dstack._internal.server.background.scheduled_tasks.volumes import process_submitted_volumes from dstack._internal.server.testing.common import ( ComputeMockSpec, create_project, diff --git a/src/tests/_internal/server/background/tasks/test_process_terminating_jobs.py b/src/tests/_internal/server/background/scheduled_tasks/test_terminating_jobs.py similarity index 99% rename from src/tests/_internal/server/background/tasks/test_process_terminating_jobs.py rename to src/tests/_internal/server/background/scheduled_tasks/test_terminating_jobs.py index 1d1c143d4f..d2b4d2d318 100644 --- a/src/tests/_internal/server/background/tasks/test_process_terminating_jobs.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_terminating_jobs.py @@ -10,7 +10,7 @@ from dstack._internal.core.models.instances import InstanceStatus from dstack._internal.core.models.runs import JobStatus, JobTerminationReason from dstack._internal.core.models.volumes import VolumeStatus -from dstack._internal.server.background.tasks.process_terminating_jobs import ( +from dstack._internal.server.background.scheduled_tasks.terminating_jobs import ( process_terminating_jobs, ) from dstack._internal.server.models import InstanceModel, JobModel, VolumeAttachmentModel diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 0c5ca338df..25cbbead36 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -12,6 +12,7 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from dstack._internal import settings from dstack._internal.core.models.backends.base import BackendType from 
dstack._internal.core.models.common import ApplyAction from dstack._internal.core.models.configurations import ( @@ -71,7 +72,6 @@ list_events, ) from dstack._internal.server.testing.matchers import SomeUUID4Str -from tests._internal.server.background.tasks.test_process_running_jobs import settings pytestmark = pytest.mark.usefixtures("image_config_mock") From 3aae583a6393b554624564eff6676913deba782b Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Wed, 18 Feb 2026 16:38:41 +0545 Subject: [PATCH 145/187] Add pd disaggregated inference (#3558) * Initial PD disaggregation implementation Test2 Internal IP Test Add worker with internal_ip Check status and register Add Status Ready Log Add Prefill-Decode Add PD to dstack Test register worker without poll Add router config in service config Update remove worker Clean Up router code Clean Up Further Cleanup * Add pd disaggregation service * Move router configuration to service * Resolve major comments * Resolve Lint Error * Minor Update * Resolve Minor Comments * Update wheel url * Resolve backward incompatibility * Update RouterConfigs * Resolve Lint Error * Update gateway wheel * Minor Update --------- Co-authored-by: Bihan Rana --- docs/docs/reference/dstack.yml/gateway.md | 2 +- gateway/pyproject.toml | 2 +- .../_internal/core/backends/base/compute.py | 10 +- .../core/backends/kubernetes/compute.py | 4 +- .../_internal/core/compatibility/gateways.py | 6 +- .../_internal/core/compatibility/runs.py | 7 ++ .../_internal/core/models/configurations.py | 9 ++ src/dstack/_internal/core/models/gateways.py | 13 ++- src/dstack/_internal/core/models/routers.py | 27 +++++- .../proxy/gateway/routers/registry.py | 1 + .../proxy/gateway/schemas/registry.py | 5 +- .../services/model_routers/__init__.py | 4 +- .../gateway/services/model_routers/base.py | 4 +- .../gateway/services/model_routers/sglang.py | 80 +++++++++++++--- .../_internal/proxy/gateway/services/nginx.py | 92 +++++++++++-------- .../proxy/gateway/services/registry.py | 
45 +++++++-- src/dstack/_internal/proxy/lib/models.py | 5 +- .../server/services/gateways/client.py | 5 +- .../_internal/server/services/proxy/repo.py | 1 + .../server/services/services/__init__.py | 62 ++++++++++++- 20 files changed, 294 insertions(+), 90 deletions(-) diff --git a/docs/docs/reference/dstack.yml/gateway.md b/docs/docs/reference/dstack.yml/gateway.md index b8e2742891..1d74c95705 100644 --- a/docs/docs/reference/dstack.yml/gateway.md +++ b/docs/docs/reference/dstack.yml/gateway.md @@ -14,7 +14,7 @@ The `gateway` configuration type allows creating and updating [gateways](../../c === "SGLang Model Gateway" - #SCHEMA# dstack._internal.core.models.routers.SGLangRouterConfig + #SCHEMA# dstack._internal.core.models.routers.SGLangGatewayRouterConfig overrides: show_root_heading: false type: diff --git a/gateway/pyproject.toml b/gateway/pyproject.toml index c40a37b7f5..6c4d406a6f 100644 --- a/gateway/pyproject.toml +++ b/gateway/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ ] [project.optional-dependencies] -sglang = ["sglang-router==0.2.1"] +sglang = ["sglang-router==0.3.2"] [tool.setuptools.package-data] "dstack.gateway" = [ diff --git a/src/dstack/_internal/core/backends/base/compute.py b/src/dstack/_internal/core/backends/base/compute.py index 49513e3211..a2507f4240 100644 --- a/src/dstack/_internal/core/backends/base/compute.py +++ b/src/dstack/_internal/core/backends/base/compute.py @@ -39,7 +39,7 @@ SSHKey, ) from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData -from dstack._internal.core.models.routers import AnyRouterConfig +from dstack._internal.core.models.routers import AnyGatewayRouterConfig from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run from dstack._internal.core.models.volumes import ( Volume, @@ -924,7 +924,9 @@ def get_run_shim_script( ] -def get_gateway_user_data(authorized_key: str, router: Optional[AnyRouterConfig] = None) -> str: +def 
get_gateway_user_data( + authorized_key: str, router: Optional[AnyGatewayRouterConfig] = None +) -> str: return get_cloud_config( package_update=True, packages=[ @@ -1036,7 +1038,7 @@ def get_latest_runner_build() -> Optional[str]: return None -def get_dstack_gateway_wheel(build: str, router: Optional[AnyRouterConfig] = None) -> str: +def get_dstack_gateway_wheel(build: str, router: Optional[AnyGatewayRouterConfig] = None) -> str: channel = "release" if settings.DSTACK_RELEASE else "stgn" base_url = f"https://dstack-gateway-downloads.s3.amazonaws.com/{channel}" if build == "latest": @@ -1049,7 +1051,7 @@ def get_dstack_gateway_wheel(build: str, router: Optional[AnyRouterConfig] = Non return f"dstack-gateway @ {wheel}" -def get_dstack_gateway_commands(router: Optional[AnyRouterConfig] = None) -> List[str]: +def get_dstack_gateway_commands(router: Optional[AnyGatewayRouterConfig] = None) -> List[str]: build = get_dstack_runner_version() or "latest" gateway_package = get_dstack_gateway_wheel(build, router) return [ diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index 51abddc70c..870b6bb657 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -66,7 +66,7 @@ ) from dstack._internal.core.models.placement import PlacementGroup from dstack._internal.core.models.resources import CPUSpec, GPUSpec -from dstack._internal.core.models.routers import AnyRouterConfig +from dstack._internal.core.models.routers import AnyGatewayRouterConfig from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run from dstack._internal.core.models.volumes import Volume from dstack._internal.utils.common import get_or_error @@ -864,7 +864,7 @@ def _wait_for_load_balancer_address( def _get_gateway_commands( - authorized_keys: List[str], router: Optional[AnyRouterConfig] = None + authorized_keys: List[str], 
router: Optional[AnyGatewayRouterConfig] = None ) -> List[str]: authorized_keys_content = "\n".join(authorized_keys).strip() gateway_commands = " && ".join(get_dstack_gateway_commands(router=router)) diff --git a/src/dstack/_internal/core/compatibility/gateways.py b/src/dstack/_internal/core/compatibility/gateways.py index de94f6a18e..949d6515f8 100644 --- a/src/dstack/_internal/core/compatibility/gateways.py +++ b/src/dstack/_internal/core/compatibility/gateways.py @@ -31,9 +31,7 @@ def _get_gateway_configuration_excludes( ) -> IncludeExcludeDictType: configuration_excludes: IncludeExcludeDictType = {} - # Add excludes like this: - # - # if configuration.tags is None: - # configuration_excludes["tags"] = True + if configuration.router is None: + configuration_excludes["router"] = True return configuration_excludes diff --git a/src/dstack/_internal/core/compatibility/runs.py b/src/dstack/_internal/core/compatibility/runs.py index 19c08cde55..4ece12392c 100644 --- a/src/dstack/_internal/core/compatibility/runs.py +++ b/src/dstack/_internal/core/compatibility/runs.py @@ -2,6 +2,7 @@ from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.routers import SGLangServiceRouterConfig from dstack._internal.core.models.runs import ( DEFAULT_PROBE_UNTIL_READY, DEFAULT_REPLICA_GROUP_NAME, @@ -72,6 +73,12 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType: # Servers prior to 0.20.8 do not support probes=None configuration_excludes["probes"] = True + router = run_spec.configuration.router + if router is None: + configuration_excludes["router"] = True + elif isinstance(router, SGLangServiceRouterConfig) and router.pd_disaggregation is False: + configuration_excludes["router"] = {"pd_disaggregation": True} + if configuration_excludes: spec_excludes["configuration"] = configuration_excludes if profile_excludes: 
diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 040c382359..93c63e6b31 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -28,6 +28,7 @@ parse_off_duration, ) from dstack._internal.core.models.resources import Range, ResourcesSpec +from dstack._internal.core.models.routers import AnyServiceRouterConfig from dstack._internal.core.models.services import AnyModel, OpenAIChatModel from dstack._internal.core.models.unix import UnixUser from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point @@ -887,6 +888,14 @@ class ServiceConfigurationParams(CoreModel): ) ), ] = None + router: Annotated[ + Optional[AnyServiceRouterConfig], + Field( + description=( + "Router configuration for the service. Requires a gateway with matching router enabled. " + ), + ), + ] = None @validator("port") def convert_port(cls, v) -> PortMapping: diff --git a/src/dstack/_internal/core/models/gateways.py b/src/dstack/_internal/core/models/gateways.py index b342c0a73b..816395fc82 100644 --- a/src/dstack/_internal/core/models/gateways.py +++ b/src/dstack/_internal/core/models/gateways.py @@ -8,7 +8,7 @@ from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import CoreModel -from dstack._internal.core.models.routers import AnyRouterConfig +from dstack._internal.core.models.routers import AnyGatewayRouterConfig from dstack._internal.utils.tags import tags_validator @@ -63,8 +63,13 @@ class GatewayConfiguration(CoreModel): ), ] = None router: Annotated[ - Optional[AnyRouterConfig], - Field(description="The router configuration"), + Optional[AnyGatewayRouterConfig], + Field( + description=( + "The router configuration for this gateway. " + "E.g. `{ type: sglang, policy: round_robin }`." 
+ ), + ), ] = None domain: Annotated[ Optional[str], Field(description="The gateway domain, e.g. `example.com`") @@ -134,7 +139,7 @@ class GatewayComputeConfiguration(CoreModel): ssh_key_pub: str certificate: Optional[AnyGatewayCertificate] = None tags: Optional[Dict[str, str]] = None - router: Optional[AnyRouterConfig] = None + router: Optional[AnyGatewayRouterConfig] = None class GatewayProvisioningData(CoreModel): diff --git a/src/dstack/_internal/core/models/routers.py b/src/dstack/_internal/core/models/routers.py index e07631e12e..49769fb8f1 100644 --- a/src/dstack/_internal/core/models/routers.py +++ b/src/dstack/_internal/core/models/routers.py @@ -11,7 +11,25 @@ class RouterType(str, Enum): SGLANG = "sglang" -class SGLangRouterConfig(CoreModel): +class SGLangGatewayRouterConfig(CoreModel): + """Gateway-level router configuration. type and policy only. pd_disaggregation is service-level.""" + + type: Annotated[ + Literal["sglang"], + Field(description="The router type enabled on this gateway."), + ] = "sglang" + policy: Annotated[ + Literal["random", "round_robin", "cache_aware", "power_of_two"], + Field( + description=( + "The routing policy. Deprecated: prefer setting policy in the service's router config. " + "Options: `random`, `round_robin`, `cache_aware`, `power_of_two`" + ), + ), + ] = "cache_aware" + + +class SGLangServiceRouterConfig(CoreModel): type: Annotated[Literal["sglang"], Field(description="The router type")] = "sglang" policy: Annotated[ Literal["random", "round_robin", "cache_aware", "power_of_two"], @@ -19,6 +37,11 @@ class SGLangRouterConfig(CoreModel): description="The routing policy. 
Options: `random`, `round_robin`, `cache_aware`, `power_of_two`" ), ] = "cache_aware" + pd_disaggregation: Annotated[ + bool, + Field(description="Enable PD disaggregation mode for the SGLang router"), + ] = False -AnyRouterConfig = SGLangRouterConfig +AnyServiceRouterConfig = SGLangServiceRouterConfig +AnyGatewayRouterConfig = SGLangGatewayRouterConfig diff --git a/src/dstack/_internal/proxy/gateway/routers/registry.py b/src/dstack/_internal/proxy/gateway/routers/registry.py index dd4f63f325..c5f4cf8a1a 100644 --- a/src/dstack/_internal/proxy/gateway/routers/registry.py +++ b/src/dstack/_internal/proxy/gateway/routers/registry.py @@ -80,6 +80,7 @@ async def register_replica( ssh_proxy=body.ssh_proxy, ssh_head_proxy=body.ssh_head_proxy, ssh_head_proxy_private_key=body.ssh_head_proxy_private_key, + internal_ip=body.internal_ip, repo=repo, nginx=nginx, service_conn_pool=service_conn_pool, diff --git a/src/dstack/_internal/proxy/gateway/schemas/registry.py b/src/dstack/_internal/proxy/gateway/schemas/registry.py index 53a29f68ca..802d23a700 100644 --- a/src/dstack/_internal/proxy/gateway/schemas/registry.py +++ b/src/dstack/_internal/proxy/gateway/schemas/registry.py @@ -3,7 +3,7 @@ from pydantic import BaseModel, Field from dstack._internal.core.models.instances import SSHConnectionParams -from dstack._internal.core.models.routers import AnyRouterConfig +from dstack._internal.core.models.routers import AnyServiceRouterConfig from dstack._internal.proxy.lib.models import RateLimit @@ -45,7 +45,7 @@ class RegisterServiceRequest(BaseModel): options: Options ssh_private_key: str rate_limits: tuple[RateLimit, ...] 
= () - router: Optional[AnyRouterConfig] = None + router: Optional[AnyServiceRouterConfig] = None class RegisterReplicaRequest(BaseModel): @@ -56,6 +56,7 @@ class RegisterReplicaRequest(BaseModel): ssh_proxy: Optional[SSHConnectionParams] ssh_head_proxy: Optional[SSHConnectionParams] ssh_head_proxy_private_key: Optional[str] + internal_ip: Optional[str] = None class RegisterEntrypointRequest(BaseModel): diff --git a/src/dstack/_internal/proxy/gateway/services/model_routers/__init__.py b/src/dstack/_internal/proxy/gateway/services/model_routers/__init__.py index 9678699ac6..43477d2d3f 100644 --- a/src/dstack/_internal/proxy/gateway/services/model_routers/__init__.py +++ b/src/dstack/_internal/proxy/gateway/services/model_routers/__init__.py @@ -1,11 +1,11 @@ -from dstack._internal.core.models.routers import AnyRouterConfig, RouterType +from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType from dstack._internal.proxy.gateway.services.model_routers.sglang import SglangRouter from dstack._internal.proxy.lib.errors import ProxyError from .base import Router, RouterContext -def get_router(router: AnyRouterConfig, context: RouterContext) -> Router: +def get_router(router: AnyServiceRouterConfig, context: RouterContext) -> Router: if router.type == RouterType.SGLANG: return SglangRouter(config=router, context=context) raise ProxyError(f"Router type '{router.type}' is not available") diff --git a/src/dstack/_internal/proxy/gateway/services/model_routers/base.py b/src/dstack/_internal/proxy/gateway/services/model_routers/base.py index 867591ca13..83ec14cb4d 100644 --- a/src/dstack/_internal/proxy/gateway/services/model_routers/base.py +++ b/src/dstack/_internal/proxy/gateway/services/model_routers/base.py @@ -4,7 +4,7 @@ from pydantic import BaseModel -from dstack._internal.core.models.routers import AnyRouterConfig +from dstack._internal.core.models.routers import AnyServiceRouterConfig class RouterContext(BaseModel): @@ -29,7 +29,7 @@ class 
Router(ABC): def __init__( self, context: RouterContext, - config: Optional[AnyRouterConfig] = None, + config: Optional[AnyServiceRouterConfig] = None, ): """Initialize router with context. diff --git a/src/dstack/_internal/proxy/gateway/services/model_routers/sglang.py b/src/dstack/_internal/proxy/gateway/services/model_routers/sglang.py index c3a0dfaae9..c1c03c5a11 100644 --- a/src/dstack/_internal/proxy/gateway/services/model_routers/sglang.py +++ b/src/dstack/_internal/proxy/gateway/services/model_routers/sglang.py @@ -2,13 +2,12 @@ import subprocess import sys import time -import urllib.parse from typing import List, Optional import httpx import psutil -from dstack._internal.core.models.routers import RouterType, SGLangRouterConfig +from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType from dstack._internal.proxy.lib.errors import UnexpectedProxyError from dstack._internal.utils.logging import get_logger @@ -22,7 +21,7 @@ class SglangRouter(Router): TYPE = RouterType.SGLANG - def __init__(self, config: SGLangRouterConfig, context: RouterContext): + def __init__(self, config: AnyServiceRouterConfig, context: RouterContext): """Initialize SGLang router. 
Args: @@ -68,6 +67,8 @@ def start(self) -> None: "--policy", self.config.policy, ] + if self.config.pd_disaggregation: + cmd.append("--pd-disaggregation") subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) @@ -174,7 +175,7 @@ def update_replicas(self, replica_urls: List[str]) -> None: # Add workers for worker_url in sorted(workers_to_add): - success = self._add_worker_to_router(worker_url) + success = self._register_worker(worker_url) if not success: logger.warning("Failed to add worker %s, continuing with others", worker_url) @@ -197,9 +198,16 @@ def _get_router_workers(self) -> List[dict]: logger.exception("Error getting sglang router workers") return [] - def _add_worker_to_router(self, worker_url: str) -> bool: + def _add_worker_to_router( + self, + url: str, + worker_type: str = "regular", + bootstrap_port: Optional[int] = None, + ) -> bool: try: - payload = {"url": worker_url, "worker_type": "regular"} + payload: dict = {"url": url, "worker_type": worker_type} + if bootstrap_port is not None: + payload["bootstrap_port"] = bootstrap_port with httpx.Client(timeout=5.0) as client: response = client.post( f"http://{self.context.host}:{self.context.port}/workers", @@ -209,8 +217,9 @@ def _add_worker_to_router(self, worker_url: str) -> bool: response_data = response.json() if response_data.get("status") == "accepted": logger.info( - "Worker %s accepted by sglang router on port %s", - worker_url, + "Worker %s (type=%s) accepted by sglang router on port %s", + url, + worker_type, self.context.port, ) return True @@ -224,21 +233,68 @@ def _add_worker_to_router(self, worker_url: str) -> bool: else: logger.error( "Failed to add worker %s: status %d, %s", - worker_url, + url, response.status_code, response.text, ) return False except Exception: - logger.exception("Error adding worker %s", worker_url) + logger.exception("Error adding worker %s", url) + return False + + def _register_worker(self, url: str) -> bool: + if not 
self.config.pd_disaggregation: + return self._add_worker_to_router(url, "regular", None) + + server_info_url = f"{url}/server_info" + try: + with httpx.Client(timeout=10) as client: + resp = client.get(server_info_url) + if resp.status_code != 200: + return False + data = resp.json() + if data.get("status") != "ready": + return False + disaggregation_mode = data.get("disaggregation_mode", "") + if disaggregation_mode == "prefill": + worker_type = "prefill" + bootstrap_port = data.get("disaggregation_bootstrap_port") + elif disaggregation_mode == "decode": + worker_type = "decode" + bootstrap_port = None + else: + worker_type = "regular" + bootstrap_port = None + logger.info( + "Registering worker %s (type=%s)", + url, + worker_type, + ) + return self._add_worker_to_router( + url, + worker_type, + bootstrap_port, + ) + except Exception: + logger.exception("Error registering worker %s", url) return False def _remove_worker_from_router(self, worker_url: str) -> bool: try: - encoded_url = urllib.parse.quote(worker_url, safe="") + current_workers = self._get_router_workers() + worker_id = None + for worker in current_workers: + url = worker.get("url") + if url and isinstance(url, str) and url == worker_url: + worker_id = worker.get("id") + if worker_id and isinstance(worker_id, str): + break + if not worker_id: + logger.error("No worker id found for url %s", worker_url) + return False with httpx.Client(timeout=5.0) as client: response = client.delete( - f"http://{self.context.host}:{self.context.port}/workers/{encoded_url}" + f"http://{self.context.host}:{self.context.port}/workers/{worker_id}" ) if response.status_code == 202: response_data = response.json() diff --git a/src/dstack/_internal/proxy/gateway/services/nginx.py b/src/dstack/_internal/proxy/gateway/services/nginx.py index c971d4197a..47b93d074d 100644 --- a/src/dstack/_internal/proxy/gateway/services/nginx.py +++ b/src/dstack/_internal/proxy/gateway/services/nginx.py @@ -5,12 +5,13 @@ from asyncio import 
Lock from pathlib import Path from typing import Dict, Optional +from urllib.parse import urlparse import jinja2 from pydantic import BaseModel from typing_extensions import Literal -from dstack._internal.core.models.routers import AnyRouterConfig, RouterType +from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType from dstack._internal.proxy.gateway.const import PROXY_PORT_ON_GATEWAY from dstack._internal.proxy.gateway.models import ACMESettings from dstack._internal.proxy.gateway.services.model_routers import ( @@ -18,6 +19,7 @@ RouterContext, get_router, ) +from dstack._internal.proxy.lib import models from dstack._internal.proxy.lib.errors import ProxyError, UnexpectedProxyError from dstack._internal.utils.common import run_async from dstack._internal.utils.logging import get_logger @@ -43,6 +45,8 @@ def render(self) -> str: class ReplicaConfig(BaseModel): id: str socket: Path + port: int + internal_ip: Optional[str] = None class LimitReqZoneConfig(BaseModel): @@ -70,7 +74,7 @@ class ServiceConfig(SiteConfig): limit_req_zones: list[LimitReqZoneConfig] locations: list[LocationConfig] replicas: list[ReplicaConfig] - router: Optional[AnyRouterConfig] = None + router: Optional[AnyServiceRouterConfig] = None router_port: Optional[int] = None cors_enabled: bool = False @@ -96,7 +100,7 @@ def __init__(self, conf_dir: Path = Path("/etc/nginx/sites-enabled")) -> None: self._next_router_port: int = self._ROUTER_PORT_MIN # Tracking of worker ports to avoid conflicts across router instances self._allocated_worker_ports: set[int] = set() - self._domain_to_worker_ports: Dict[str, list[int]] = {} + self._domain_to_worker_urls: Dict[str, list[str]] = {} self._next_worker_port: int = self._WORKER_PORT_MIN async def register(self, conf: SiteConfig, acme: ACMESettings) -> None: @@ -145,33 +149,40 @@ async def register(self, conf: SiteConfig, acme: ACMESettings) -> None: del self._domain_to_router[conf.domain] raise - allocated_ports = 
self._allocate_worker_ports(len(conf.replicas)) - replica_urls = [ - f"http://{router.context.host}:{port}" for port in allocated_ports - ] - - # Write router workers config - try: + if conf.router.pd_disaggregation: + # PD path: replica_urls from internal_ip (router talks directly to workers) + if any(not r.internal_ip for r in conf.replicas): + raise ProxyError( + "PD disaggregation requires internal IP for all replicas." + ) + replica_urls = [ + f"http://{replica.internal_ip}:{replica.port}" + for replica in conf.replicas + ] + self._domain_to_worker_urls[conf.domain] = replica_urls + else: + # Non-PD path: allocate gateway-local ports, nginx proxies to replica sockets + allocated_ports = self._allocate_worker_ports(len(conf.replicas)) + replica_urls = [ + f"http://{router.context.host}:{port}" for port in allocated_ports + ] if conf.replicas: - await run_async(self.write_router_workers_conf, conf, allocated_ports) - # Discard old worker ports if domain already has allocated ports (required for scaling case) - if conf.domain in self._domain_to_worker_ports: - old_worker_ports = self._domain_to_worker_ports[conf.domain] - for port in old_worker_ports: - self._allocated_worker_ports.discard(port) - self._domain_to_worker_ports[conf.domain] = allocated_ports - except Exception as e: - logger.exception( - "write_router_workers_conf failed for domain=%s: %s", conf.domain, e - ) - raise + await run_async( + self.write_router_workers_conf, + conf, + allocated_ports, + ) + if conf.domain in self._domain_to_worker_urls: + self._discard_ports(self._domain_to_worker_urls[conf.domain]) + self._domain_to_worker_urls[conf.domain] = replica_urls - # Update replicas to router (actual HTTP API calls to add workers) try: await run_async(router.update_replicas, replica_urls) except Exception as e: logger.exception( - "Failed to add replicas to router for domain=%s: %s", conf.domain, e + "Failed to add replicas to router for domain=%s: %s", + conf.domain, + e, ) raise @@ -179,7 
+190,8 @@ async def register(self, conf: SiteConfig, acme: ACMESettings) -> None: logger.info("Registered %s domain %s", conf.type, conf.domain) - async def unregister(self, domain: str) -> None: + async def unregister(self, service: models.Service) -> None: + domain = service.domain_safe logger.debug("Unregistering domain %s", domain) conf_path = self._conf_dir / self.get_config_name(domain) if not conf_path.exists(): @@ -190,12 +202,16 @@ async def unregister(self, domain: str) -> None: if domain in self._domain_to_router: router = self._domain_to_router[domain] # Remove all workers for this domain - if domain in self._domain_to_worker_ports: - worker_ports = self._domain_to_worker_ports[domain] - replica_urls = [ - f"http://{router.context.host}:{port}" for port in worker_ports - ] - await run_async(router.remove_replicas, replica_urls) + if domain in self._domain_to_worker_urls: + worker_urls = self._domain_to_worker_urls[domain] + await run_async(router.remove_replicas, worker_urls) + pd_disaggregation = ( + service.router.pd_disaggregation if service.router else False + ) + if not pd_disaggregation: + self._discard_ports(worker_urls) + del self._domain_to_worker_urls[domain] + logger.debug("Removed worker URLs for domain %s", domain) # Stop and kill the router await run_async(router.stop) # Remove from mappings @@ -204,14 +220,6 @@ async def unregister(self, domain: str) -> None: del self._router_port_to_domain[router_port] del self._domain_to_router[domain] - # Discard worker ports for this domain - if domain in self._domain_to_worker_ports: - worker_ports = self._domain_to_worker_ports[domain] - for port in worker_ports: - self._allocated_worker_ports.discard(port) - del self._domain_to_worker_ports[domain] - logger.debug("Freed worker ports %s for domain %s", worker_ports, domain) - # Remove workers config file workers_conf_path = self._conf_dir / f"router-workers.{domain}.conf" if workers_conf_path.exists(): @@ -404,6 +412,12 @@ def 
_allocate_worker_ports(self, num_ports: int) -> list[int]: return allocated + def _discard_ports(self, urls: list[str]) -> None: + for u in urls: + parsed = urlparse(u) + if parsed.port is not None and parsed.port in self._allocated_worker_ports: + self._allocated_worker_ports.discard(parsed.port) + def write_global_conf(self) -> None: conf = read_package_resource("00-log-format.conf") self.write_conf(conf, "00-log-format.conf") diff --git a/src/dstack/_internal/proxy/gateway/services/registry.py b/src/dstack/_internal/proxy/gateway/services/registry.py index 036b864396..dc6407d245 100644 --- a/src/dstack/_internal/proxy/gateway/services/registry.py +++ b/src/dstack/_internal/proxy/gateway/services/registry.py @@ -6,7 +6,7 @@ import dstack._internal.proxy.gateway.schemas.registry as schemas from dstack._internal.core.models.instances import SSHConnectionParams -from dstack._internal.core.models.routers import AnyRouterConfig, RouterType +from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType from dstack._internal.proxy.gateway import models as gateway_models from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo from dstack._internal.proxy.gateway.services.nginx import ( @@ -45,7 +45,7 @@ async def register_service( repo: GatewayProxyRepo, nginx: Nginx, service_conn_pool: ServiceConnectionPool, - router: Optional[AnyRouterConfig] = None, + router: Optional[AnyServiceRouterConfig] = None, ) -> None: cors_enabled = model is not None and model.type == "chat" and model.format == "openai" service = models.Service( @@ -118,7 +118,7 @@ async def unregister_service( ids=(r.id for r in service.replicas), service_conn_pool=service_conn_pool, ) - await nginx.unregister(service.domain_safe) + await nginx.unregister(service) await repo.delete_models_by_run(project_name, run_name) await repo.delete_service(project_name, run_name) @@ -138,6 +138,7 @@ async def register_replica( repo: GatewayProxyRepo, nginx: Nginx, service_conn_pool: 
ServiceConnectionPool, + internal_ip: Optional[str] = None, ) -> None: replica = models.Replica( id=replica_id, @@ -147,6 +148,7 @@ async def register_replica( ssh_proxy=ssh_proxy, ssh_head_proxy=ssh_head_proxy, ssh_head_proxy_private_key=ssh_head_proxy_private_key, + internal_ip=internal_ip, ) async with lock: @@ -237,6 +239,11 @@ async def register_model_entrypoint( logger.info("Entrypoint %s is now registered in project %s", domain, project_name) +def _uses_pd_disaggregation(service: models.Service) -> bool: + """PD disaggregation: router talks to replicas via internal_ip, no SSH tunnels needed.""" + return service.router is not None and service.router.pd_disaggregation + + async def apply_service( service: models.Service, old_service: Optional[models.Service], @@ -256,13 +263,31 @@ async def apply_service( ), service_conn_pool=service_conn_pool, ) - replica_conns, replica_failures = await get_or_add_replica_connections( - service, repo, service_conn_pool - ) - replica_configs = [ - ReplicaConfig(id=replica.id, socket=conn.app_socket_path) - for replica, conn in replica_conns.items() - ] + if _uses_pd_disaggregation(service): + replica_conns = {} + replica_failures = {} + replica_configs = [ + ReplicaConfig( + id=replica.id, + socket=Path("/dev/null"), + port=replica.app_port, + internal_ip=replica.internal_ip, + ) + for replica in service.replicas + ] + else: + replica_conns, replica_failures = await get_or_add_replica_connections( + service, repo, service_conn_pool + ) + replica_configs = [ + ReplicaConfig( + id=replica.id, + socket=conn.app_socket_path, + port=replica.app_port, + internal_ip=replica.internal_ip, + ) + for replica, conn in replica_conns.items() + ] service_config = await get_nginx_service_config(service, replica_configs) await nginx.register(service_config, (await repo.get_config()).acme_settings) return replica_failures diff --git a/src/dstack/_internal/proxy/lib/models.py b/src/dstack/_internal/proxy/lib/models.py index 
f304bbc394..a0a724dbea 100644 --- a/src/dstack/_internal/proxy/lib/models.py +++ b/src/dstack/_internal/proxy/lib/models.py @@ -7,7 +7,7 @@ from typing_extensions import Annotated from dstack._internal.core.models.instances import SSHConnectionParams -from dstack._internal.core.models.routers import AnyRouterConfig +from dstack._internal.core.models.routers import AnyServiceRouterConfig from dstack._internal.proxy.lib.errors import UnexpectedProxyError @@ -27,6 +27,7 @@ class Replica(ImmutableModel): # Optional outer proxy, a head node/bastion ssh_head_proxy: Optional[SSHConnectionParams] = None ssh_head_proxy_private_key: Optional[str] = None + internal_ip: Optional[str] = None class IPAddressPartitioningKey(ImmutableModel): @@ -58,7 +59,7 @@ class Service(ImmutableModel): client_max_body_size: int # only enforced on gateways strip_prefix: bool = True # only used in-server replicas: tuple[Replica, ...] - router: Optional[AnyRouterConfig] = None + router: Optional[AnyServiceRouterConfig] = None cors_enabled: bool = False # only used on gateways; enabled for openai-format models @property diff --git a/src/dstack/_internal/server/services/gateways/client.py b/src/dstack/_internal/server/services/gateways/client.py index d4f1c831e8..9bc7a1f903 100644 --- a/src/dstack/_internal/server/services/gateways/client.py +++ b/src/dstack/_internal/server/services/gateways/client.py @@ -9,7 +9,7 @@ from dstack._internal.core.errors import GatewayError from dstack._internal.core.models.configurations import RateLimit from dstack._internal.core.models.instances import SSHConnectionParams -from dstack._internal.core.models.routers import AnyRouterConfig +from dstack._internal.core.models.routers import AnyServiceRouterConfig from dstack._internal.core.models.runs import JobSpec, JobSubmission, Run, get_service_port from dstack._internal.proxy.gateway.schemas.stats import ServiceStats from dstack._internal.server import settings @@ -46,7 +46,7 @@ async def register_service( options: 
dict, rate_limits: list[RateLimit], ssh_private_key: str, - router: Optional[AnyRouterConfig] = None, + router: Optional[AnyServiceRouterConfig] = None, ): if "openai" in options: entrypoint = f"gateway.{domain.split('.', maxsplit=1)[1]}" @@ -99,6 +99,7 @@ async def register_replica( assert jpd is not None assert jpd.hostname is not None assert jpd.ssh_port is not None + payload["internal_ip"] = jpd.internal_ip if not jpd.dockerized: payload.update( { diff --git a/src/dstack/_internal/server/services/proxy/repo.py b/src/dstack/_internal/server/services/proxy/repo.py index 7f1564fe62..385c9e654f 100644 --- a/src/dstack/_internal/server/services/proxy/repo.py +++ b/src/dstack/_internal/server/services/proxy/repo.py @@ -111,6 +111,7 @@ async def get_service(self, project_name: str, run_name: str) -> Optional[Servic ssh_proxy=ssh_proxy, ssh_head_proxy=ssh_head_proxy, ssh_head_proxy_private_key=ssh_head_proxy_private_key, + internal_ip=jpd.internal_ip, ) replicas.append(replica) return Service( diff --git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index 511cf7cc93..b701b822b0 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -26,6 +26,11 @@ ) from dstack._internal.core.models.gateways import GatewayConfiguration, GatewayStatus from dstack._internal.core.models.instances import SSHConnectionParams +from dstack._internal.core.models.routers import ( + AnyServiceRouterConfig, + RouterType, + SGLangServiceRouterConfig, +) from dstack._internal.core.models.runs import JobSpec, Run, RunSpec, ServiceModelSpec, ServiceSpec from dstack._internal.core.models.services import OpenAIChatModel from dstack._internal.server import settings @@ -45,6 +50,41 @@ logger = get_logger(__name__) +def _gateway_has_sglang_router(config: GatewayConfiguration) -> bool: + return config.router is not None and config.router.type == 
RouterType.SGLANG.value + + +def _build_service_router_config( + gateway_configuration: GatewayConfiguration, + service_configuration: ServiceConfiguration, +) -> Optional[AnyServiceRouterConfig]: + """ + Build router config from gateway (type, policy) + service (pd_disaggregation, policy override). + Service's policy overrides gateway's if present. Keeps backward compat: SGLang enabled + automatically when gateway has it configured. + """ + if not _gateway_has_sglang_router(gateway_configuration): + return None + + gateway_router = gateway_configuration.router + assert gateway_router is not None # ensured by _gateway_has_sglang_router + router_type = gateway_router.type + policy = gateway_router.policy + + service_router = service_configuration.router + if service_router is not None and isinstance(service_router, SGLangServiceRouterConfig): + policy = service_router.policy + pd_disaggregation = service_router.pd_disaggregation + else: + pd_disaggregation = False + + return SGLangServiceRouterConfig( + type=router_type, + policy=policy, + pd_disaggregation=pd_disaggregation, + ) + + async def register_service(session: AsyncSession, run_model: RunModel, run_spec: RunSpec): assert isinstance(run_spec.configuration, ServiceConfiguration) @@ -92,8 +132,20 @@ async def _register_service_in_gateway( raise ServerClientError("Gateway status is not running") gateway_configuration = get_gateway_configuration(gateway) + + # Check: service specifies SGLang router but gateway does not have it + service_router = run_spec.configuration.router + service_wants_sglang = service_router is not None and isinstance( + service_router, SGLangServiceRouterConfig + ) + if service_wants_sglang and not _gateway_has_sglang_router(gateway_configuration): + raise ServerClientError( + "Service requires gateway with SGLang router but gateway " + f"'{gateway.name}' does not have the SGLang router configured." 
+ ) + service_https = _get_service_https(run_spec, gateway_configuration) - router = gateway_configuration.router + router = _build_service_router_config(gateway_configuration, run_spec.configuration) service_protocol = "https" if service_https else "http" if service_https and gateway_configuration.certificate is None: @@ -158,6 +210,14 @@ async def _register_service_in_gateway( def _register_service_in_server(run_model: RunModel, run_spec: RunSpec) -> ServiceSpec: assert run_spec.configuration.type == "service" + if ( + run_spec.configuration.router is not None + and run_spec.configuration.router.type == RouterType.SGLANG + ): + raise ServerClientError( + "Service with SGLang router configuration requires a gateway. " + "Please configure a gateway with the SGLang router enabled." + ) if run_spec.configuration.https != SERVICE_HTTPS_DEFAULT: # Note: if the user sets `https: `, it will be ignored silently # TODO: in 0.19, make `https` Optional to be able to tell if it was set or omitted From f7dacf7f37c9f53b490d36f64e231470928f4aa1 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Wed, 18 Feb 2026 16:02:25 +0500 Subject: [PATCH 146/187] Group db migrations (#3583) --- src/dstack/_internal/server/alembic.ini | 6 +++--- src/dstack/_internal/server/db.py | 1 + src/dstack/_internal/server/migrations/script.py.mako | 2 +- .../{a060e2440936_.py => 2023/09_20_1634_a060e2440936_.py} | 0 .../{bfba43f6def2_.py => 2023/09_22_1052_bfba43f6def2_.py} | 0 .../{252d3743b641_.py => 2023/09_25_1609_252d3743b641_.py} | 0 .../09_27_1742_fe72c4de8376_add_gateways.py} | 0 ...1_01_1019_d0bb68e48b9f_add_project_owners_and_quotas.py} | 0 .../11_01_1135_112753bc17dd_remove_nullable_fields.py} | 0 .../11_03_1646_14f2cb002fc2_add_jobmodel_removed_flag.py} | 0 .../11_06_1613_23e01c56279a_make_blob_nullable.py} | 0 .../11_14_1041_3dbdce90d0e0_fix_code_uq_constraint.py} | 0 .../11_14_1609_686fb8341ea5_add_user_emails.py} | 0 ..._11_1034_e6391ca6a264_separate_gateways_from_compute.py} | 0 
...19_1555_48ad3ecbaea2_do_not_delete_projects_and_runs.py} | 0 ...01_09_1223_d3e8af4786fa_gateway_compute_flag_deleted.py} | 0 .../02_12_1427_27d3e55759fa_add_pools.py} | 0 .../{29c08c6a8cb3_.py => 2024/02_14_1139_29c08c6a8cb3_.py} | 0 ...139_9eea6af28e10_added_fail_reason_for_instancemodel.py} | 0 .../02_21_1011_1a48dfe44a40_rework_termination_handling.py} | 0 ...547_ed0ca30e13bb_migrate_instancestatus_provisioning.py} | 0 ...02_28_0615_b88d55c2a07d_replace_instancestatus_ready.py} | 0 .../03_01_1430_4b4319398164_introduce_runs_processing.py} | 0 .../03_07_1721_0e33559e16ed_update_instancestatus.py} | 0 ..._555138b1f77f_change_instancemodel_for_asynchronous_.py} | 0 .../03_13_1048_5ec538b70e71_replace_instansestatus.py} | 0 .../03_18_1216_4ae1a5b0e7f1_add_run_list_index.py} | 0 ..._29_0637_99b4c8c954ea_add_termination_reason_message.py} | 0 ...02_0142_866ec1d67184_replace_retrypolicy_limit_with_.py} | 0 ...4_08_0802_1e3fb39ef74b_add_remote_connection_details.py} | 0 ..._15_1104_58aa5162dcc3_add_gatewaymodel_configuration.py} | 0 ...8_c154eece89da_add_fields_for_async_gateway_creation.py} | 0 ...1338_dfffd6a1165c_add_fields_for_gateways_behind_alb.py} | 0 ..._1040_29826f417010_remove_instancemodel_retry_policy.py} | 0 ...5_30_0955_b4d6ad60db08_add_instancemodel_unreachable.py} | 0 .../06_26_1122_98cd9c8b5927_add_volumemodel.py} | 0 .../07_04_1726_5ad8debc8fe6_fixes_for_psql.py} | 0 .../07_14_2143_91ac5e543037_extend_repos_creds_column.py} | 0 .../07_15_2309_3cf77fb8bcf1_store_repo_clone_url.py} | 0 .../07_17_1543_c00090eaef21_support_fleets.py} | 0 .../08_15_1024_710e5b3fac8f_add_encryption.py} | 0 .../08_16_1425_54a77e19c64c_add_manager_project_role.py} | 0 .../08_19_1510_d6b11105f659_add_usermodel_active.py} | 0 .../08_21_1420_ea60480f82bb_add_membermodel_member_num.py} | 0 ...342_7b24b1c8eba7_add_instancemodel_last_processed_at.py} | 0 .../09_10_1107_c83d45f9a971_replace_string_with_text.py} | 0 ..._e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py} | 0 
.../09_25_1352_a7b46c073fa1_add_placementgroupmodel.py} | 0 .../10_14_1126_c20626d03cfb_add_jobmetricspoint.py} | 0 ...6_1431_afbc600ff2b2_add_created_at_to_usermodel_and_.py} | 0 .../{82b32a135ea2_.py => 2024/11_04_1546_82b32a135ea2_.py} | 0 .../11_14_1031_91a12fff6c76_add_repocredsmodel.py} | 0 ...2_24_1256_065588ec72b8_add_vultr_to_backendtype_enum.py} | 0 ...1_10_1417_803c7e9ed85d_add_jobmodel_job_runtime_data.py} | 0 ...4_1333_c48df7985d57_add_instance_termination_retries.py} | 0 ..._1459_1338b788b612_reverse_job_instance_relationship.py} | 0 ..._ffa99edd1988_add_jobterminationreason_max_duration_.py} | 0 ...9_1152_da574e93fee0_add_jobmodel_volumes_detached_at.py} | 0 ...04_1110_51d45659d574_add_instancemodel_blocks_fields.py} | 0 ...30_63c3f19cb184_add_jobterminationreason_inactivity_.py} | 0 ...02_11_2337_1e76fb0dde87_add_jobmodel_inactivity_secs.py} | 0 ...2_1319_a751ef183f27_move_attachment_data_to_volumes_.py} | 0 .../02_21_1059_60e444118b6d_add_jobprometheusmetrics.py} | 0 ...8d1b92988bc_add_jobterminationreason_terminated_due_.py} | 0 .../03_10_1449_bc8ca4a505c6_store_backendtype_as_string.py} | 0 ...113_7bc2586e8b9e_make_instancemodel_pool_id_optional.py} | 0 ..._1800_7ba3b59d7ca6_add_runmodel_resubmission_attempt.py} | 0 .../05_09_1025_6c1a9d6530ee_add_jobmodel_exit_status.py} | 0 ...05_13_1624_20166748b60c_add_jobmodel_disconnected_at.py} | 0 .../05_14_1524_bca2fdf130bf_add_runmodel_priority.py} | 0 ...5_29_1530_35e90e1b0d3e_add_rolling_deployment_fields.py} | 0 .../06_06_1304_35f732ee4cf5_add_projectmodel_is_public.py} | 0 .../06_12_1228_5f1707c525d2_add_filearchivemodel.py} | 0 .../06_30_1100_644b8a114187_add_secretmodel.py} | 0 ...6_d5863798bf41_add_volumemodel_last_job_processed_at.py} | 0 ..._17_1547_ec02a26a256c_add_runmodel_next_triggered_at.py} | 0 .../07_25_1036_50dd7ea98639_index_status_columns.py} | 0 .../08_01_1456_728b1488b1b4_add_instance_health.py} | 0 .../08_03_1951_25479f540245_add_probes.py} | 0 
.../08_06_1349_74a1f55209bd_store_enums_as_strings.py} | 0 .../08_11_1323_3d7f6c2ec000_add_jobmodel_registered.py} | 0 .../08_15_1126_e2d08cd1b8d9_add_jobmodel_fleet.py} | 0 ...8_2498ab323443_add_fleetmodel_consolidation_attempt_.py} | 0 .../10_09_2031_ff1d94f65b08_user_ssh_key.py} | 0 .../10_21_1601_7d1ec2b920ac_add_computegroupmodel.py} | 0 ...06e977bc61c7_add_usermodel_deleted_and_original_name.py} | 0 .../11_27_1511_006512f572b4_add_projects_original_name.py} | 0 .../12_04_2048_d4d9dc26cf58_add_ix_jobs_run_id.py} | 0 .../12_04_2052_5fd659afca82_add_ix_instances_fleet_id.py} | 0 ...12_04_2056_22d74df9897e_add_events_and_event_targets.py} | 0 ...054_706e0acc3a7d_add_runmodel_desired_replica_counts.py} | 0 .../12_21_2208_1aa9638ad963_added_email_index.py} | 0 ...03c91e24634_add_instances_termination_reason_message.py} | 0 ..._57cff3ec86ce_add_computegroupmodel_pipeline_columns.py} | 0 ...c2a227b0154_add_placementgroupmodel_pipeline_columns.py} | 0 ...2_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py} | 0 96 files changed, 5 insertions(+), 4 deletions(-) rename src/dstack/_internal/server/migrations/versions/{a060e2440936_.py => 2023/09_20_1634_a060e2440936_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{bfba43f6def2_.py => 2023/09_22_1052_bfba43f6def2_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{252d3743b641_.py => 2023/09_25_1609_252d3743b641_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{fe72c4de8376_add_gateways.py => 2023/09_27_1742_fe72c4de8376_add_gateways.py} (100%) rename src/dstack/_internal/server/migrations/versions/{d0bb68e48b9f_add_project_owners_and_quotas.py => 2023/11_01_1019_d0bb68e48b9f_add_project_owners_and_quotas.py} (100%) rename src/dstack/_internal/server/migrations/versions/{112753bc17dd_remove_nullable_fields.py => 2023/11_01_1135_112753bc17dd_remove_nullable_fields.py} (100%) rename 
src/dstack/_internal/server/migrations/versions/{14f2cb002fc2_add_jobmodel_removed_flag.py => 2023/11_03_1646_14f2cb002fc2_add_jobmodel_removed_flag.py} (100%) rename src/dstack/_internal/server/migrations/versions/{23e01c56279a_make_blob_nullable.py => 2023/11_06_1613_23e01c56279a_make_blob_nullable.py} (100%) rename src/dstack/_internal/server/migrations/versions/{3dbdce90d0e0_fix_code_uq_constraint.py => 2023/11_14_1041_3dbdce90d0e0_fix_code_uq_constraint.py} (100%) rename src/dstack/_internal/server/migrations/versions/{686fb8341ea5_add_user_emails.py => 2023/11_14_1609_686fb8341ea5_add_user_emails.py} (100%) rename src/dstack/_internal/server/migrations/versions/{e6391ca6a264_separate_gateways_from_compute.py => 2023/12_11_1034_e6391ca6a264_separate_gateways_from_compute.py} (100%) rename src/dstack/_internal/server/migrations/versions/{48ad3ecbaea2_do_not_delete_projects_and_runs.py => 2023/12_19_1555_48ad3ecbaea2_do_not_delete_projects_and_runs.py} (100%) rename src/dstack/_internal/server/migrations/versions/{d3e8af4786fa_gateway_compute_flag_deleted.py => 2024/01_09_1223_d3e8af4786fa_gateway_compute_flag_deleted.py} (100%) rename src/dstack/_internal/server/migrations/versions/{27d3e55759fa_add_pools.py => 2024/02_12_1427_27d3e55759fa_add_pools.py} (100%) rename src/dstack/_internal/server/migrations/versions/{29c08c6a8cb3_.py => 2024/02_14_1139_29c08c6a8cb3_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{9eea6af28e10_added_fail_reason_for_instancemodel.py => 2024/02_19_1139_9eea6af28e10_added_fail_reason_for_instancemodel.py} (100%) rename src/dstack/_internal/server/migrations/versions/{1a48dfe44a40_rework_termination_handling.py => 2024/02_21_1011_1a48dfe44a40_rework_termination_handling.py} (100%) rename src/dstack/_internal/server/migrations/versions/{ed0ca30e13bb_migrate_instancestatus_provisioning.py => 2024/02_28_0547_ed0ca30e13bb_migrate_instancestatus_provisioning.py} (100%) rename 
src/dstack/_internal/server/migrations/versions/{b88d55c2a07d_replace_instancestatus_ready.py => 2024/02_28_0615_b88d55c2a07d_replace_instancestatus_ready.py} (100%) rename src/dstack/_internal/server/migrations/versions/{4b4319398164_introduce_runs_processing.py => 2024/03_01_1430_4b4319398164_introduce_runs_processing.py} (100%) rename src/dstack/_internal/server/migrations/versions/{0e33559e16ed_update_instancestatus.py => 2024/03_07_1721_0e33559e16ed_update_instancestatus.py} (100%) rename src/dstack/_internal/server/migrations/versions/{555138b1f77f_change_instancemodel_for_asynchronous_.py => 2024/03_12_1717_555138b1f77f_change_instancemodel_for_asynchronous_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{5ec538b70e71_replace_instansestatus.py => 2024/03_13_1048_5ec538b70e71_replace_instansestatus.py} (100%) rename src/dstack/_internal/server/migrations/versions/{4ae1a5b0e7f1_add_run_list_index.py => 2024/03_18_1216_4ae1a5b0e7f1_add_run_list_index.py} (100%) rename src/dstack/_internal/server/migrations/versions/{99b4c8c954ea_add_termination_reason_message.py => 2024/03_29_0637_99b4c8c954ea_add_termination_reason_message.py} (100%) rename src/dstack/_internal/server/migrations/versions/{866ec1d67184_replace_retrypolicy_limit_with_.py => 2024/04_02_0142_866ec1d67184_replace_retrypolicy_limit_with_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{1e3fb39ef74b_add_remote_connection_details.py => 2024/04_08_0802_1e3fb39ef74b_add_remote_connection_details.py} (100%) rename src/dstack/_internal/server/migrations/versions/{58aa5162dcc3_add_gatewaymodel_configuration.py => 2024/05_15_1104_58aa5162dcc3_add_gatewaymodel_configuration.py} (100%) rename src/dstack/_internal/server/migrations/versions/{c154eece89da_add_fields_for_async_gateway_creation.py => 2024/05_16_1418_c154eece89da_add_fields_for_async_gateway_creation.py} (100%) rename 
src/dstack/_internal/server/migrations/versions/{dfffd6a1165c_add_fields_for_gateways_behind_alb.py => 2024/05_22_1338_dfffd6a1165c_add_fields_for_gateways_behind_alb.py} (100%) rename src/dstack/_internal/server/migrations/versions/{29826f417010_remove_instancemodel_retry_policy.py => 2024/05_29_1040_29826f417010_remove_instancemodel_retry_policy.py} (100%) rename src/dstack/_internal/server/migrations/versions/{b4d6ad60db08_add_instancemodel_unreachable.py => 2024/05_30_0955_b4d6ad60db08_add_instancemodel_unreachable.py} (100%) rename src/dstack/_internal/server/migrations/versions/{98cd9c8b5927_add_volumemodel.py => 2024/06_26_1122_98cd9c8b5927_add_volumemodel.py} (100%) rename src/dstack/_internal/server/migrations/versions/{5ad8debc8fe6_fixes_for_psql.py => 2024/07_04_1726_5ad8debc8fe6_fixes_for_psql.py} (100%) rename src/dstack/_internal/server/migrations/versions/{91ac5e543037_extend_repos_creds_column.py => 2024/07_14_2143_91ac5e543037_extend_repos_creds_column.py} (100%) rename src/dstack/_internal/server/migrations/versions/{3cf77fb8bcf1_store_repo_clone_url.py => 2024/07_15_2309_3cf77fb8bcf1_store_repo_clone_url.py} (100%) rename src/dstack/_internal/server/migrations/versions/{c00090eaef21_support_fleets.py => 2024/07_17_1543_c00090eaef21_support_fleets.py} (100%) rename src/dstack/_internal/server/migrations/versions/{710e5b3fac8f_add_encryption.py => 2024/08_15_1024_710e5b3fac8f_add_encryption.py} (100%) rename src/dstack/_internal/server/migrations/versions/{54a77e19c64c_add_manager_project_role.py => 2024/08_16_1425_54a77e19c64c_add_manager_project_role.py} (100%) rename src/dstack/_internal/server/migrations/versions/{d6b11105f659_add_usermodel_active.py => 2024/08_19_1510_d6b11105f659_add_usermodel_active.py} (100%) rename src/dstack/_internal/server/migrations/versions/{ea60480f82bb_add_membermodel_member_num.py => 2024/08_21_1420_ea60480f82bb_add_membermodel_member_num.py} (100%) rename 
src/dstack/_internal/server/migrations/versions/{7b24b1c8eba7_add_instancemodel_last_processed_at.py => 2024/08_30_1342_7b24b1c8eba7_add_instancemodel_last_processed_at.py} (100%) rename src/dstack/_internal/server/migrations/versions/{c83d45f9a971_replace_string_with_text.py => 2024/09_10_1107_c83d45f9a971_replace_string_with_text.py} (100%) rename src/dstack/_internal/server/migrations/versions/{e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py => 2024/09_17_1223_e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py} (100%) rename src/dstack/_internal/server/migrations/versions/{a7b46c073fa1_add_placementgroupmodel.py => 2024/09_25_1352_a7b46c073fa1_add_placementgroupmodel.py} (100%) rename src/dstack/_internal/server/migrations/versions/{c20626d03cfb_add_jobmetricspoint.py => 2024/10_14_1126_c20626d03cfb_add_jobmetricspoint.py} (100%) rename src/dstack/_internal/server/migrations/versions/{afbc600ff2b2_add_created_at_to_usermodel_and_.py => 2024/10_16_1431_afbc600ff2b2_add_created_at_to_usermodel_and_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{82b32a135ea2_.py => 2024/11_04_1546_82b32a135ea2_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{91a12fff6c76_add_repocredsmodel.py => 2024/11_14_1031_91a12fff6c76_add_repocredsmodel.py} (100%) rename src/dstack/_internal/server/migrations/versions/{065588ec72b8_add_vultr_to_backendtype_enum.py => 2024/12_24_1256_065588ec72b8_add_vultr_to_backendtype_enum.py} (100%) rename src/dstack/_internal/server/migrations/versions/{803c7e9ed85d_add_jobmodel_job_runtime_data.py => 2025/01_10_1417_803c7e9ed85d_add_jobmodel_job_runtime_data.py} (100%) rename src/dstack/_internal/server/migrations/versions/{c48df7985d57_add_instance_termination_retries.py => 2025/01_14_1333_c48df7985d57_add_instance_termination_retries.py} (100%) rename src/dstack/_internal/server/migrations/versions/{1338b788b612_reverse_job_instance_relationship.py => 
2025/01_16_1459_1338b788b612_reverse_job_instance_relationship.py} (100%) rename src/dstack/_internal/server/migrations/versions/{ffa99edd1988_add_jobterminationreason_max_duration_.py => 2025/01_21_1053_ffa99edd1988_add_jobterminationreason_max_duration_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{da574e93fee0_add_jobmodel_volumes_detached_at.py => 2025/01_29_1152_da574e93fee0_add_jobmodel_volumes_detached_at.py} (100%) rename src/dstack/_internal/server/migrations/versions/{51d45659d574_add_instancemodel_blocks_fields.py => 2025/02_04_1110_51d45659d574_add_instancemodel_blocks_fields.py} (100%) rename src/dstack/_internal/server/migrations/versions/{63c3f19cb184_add_jobterminationreason_inactivity_.py => 2025/02_11_2230_63c3f19cb184_add_jobterminationreason_inactivity_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{1e76fb0dde87_add_jobmodel_inactivity_secs.py => 2025/02_11_2337_1e76fb0dde87_add_jobmodel_inactivity_secs.py} (100%) rename src/dstack/_internal/server/migrations/versions/{a751ef183f27_move_attachment_data_to_volumes_.py => 2025/02_12_1319_a751ef183f27_move_attachment_data_to_volumes_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{60e444118b6d_add_jobprometheusmetrics.py => 2025/02_21_1059_60e444118b6d_add_jobprometheusmetrics.py} (100%) rename src/dstack/_internal/server/migrations/versions/{98d1b92988bc_add_jobterminationreason_terminated_due_.py => 2025/02_28_1512_98d1b92988bc_add_jobterminationreason_terminated_due_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{bc8ca4a505c6_store_backendtype_as_string.py => 2025/03_10_1449_bc8ca4a505c6_store_backendtype_as_string.py} (100%) rename src/dstack/_internal/server/migrations/versions/{7bc2586e8b9e_make_instancemodel_pool_id_optional.py => 2025/03_13_1113_7bc2586e8b9e_make_instancemodel_pool_id_optional.py} (100%) rename src/dstack/_internal/server/migrations/versions/{7ba3b59d7ca6_add_runmodel_resubmission_attempt.py => 
2025/04_15_1800_7ba3b59d7ca6_add_runmodel_resubmission_attempt.py} (100%) rename src/dstack/_internal/server/migrations/versions/{6c1a9d6530ee_add_jobmodel_exit_status.py => 2025/05_09_1025_6c1a9d6530ee_add_jobmodel_exit_status.py} (100%) rename src/dstack/_internal/server/migrations/versions/{20166748b60c_add_jobmodel_disconnected_at.py => 2025/05_13_1624_20166748b60c_add_jobmodel_disconnected_at.py} (100%) rename src/dstack/_internal/server/migrations/versions/{bca2fdf130bf_add_runmodel_priority.py => 2025/05_14_1524_bca2fdf130bf_add_runmodel_priority.py} (100%) rename src/dstack/_internal/server/migrations/versions/{35e90e1b0d3e_add_rolling_deployment_fields.py => 2025/05_29_1530_35e90e1b0d3e_add_rolling_deployment_fields.py} (100%) rename src/dstack/_internal/server/migrations/versions/{35f732ee4cf5_add_projectmodel_is_public.py => 2025/06_06_1304_35f732ee4cf5_add_projectmodel_is_public.py} (100%) rename src/dstack/_internal/server/migrations/versions/{5f1707c525d2_add_filearchivemodel.py => 2025/06_12_1228_5f1707c525d2_add_filearchivemodel.py} (100%) rename src/dstack/_internal/server/migrations/versions/{644b8a114187_add_secretmodel.py => 2025/06_30_1100_644b8a114187_add_secretmodel.py} (100%) rename src/dstack/_internal/server/migrations/versions/{d5863798bf41_add_volumemodel_last_job_processed_at.py => 2025/07_15_1426_d5863798bf41_add_volumemodel_last_job_processed_at.py} (100%) rename src/dstack/_internal/server/migrations/versions/{ec02a26a256c_add_runmodel_next_triggered_at.py => 2025/07_17_1547_ec02a26a256c_add_runmodel_next_triggered_at.py} (100%) rename src/dstack/_internal/server/migrations/versions/{50dd7ea98639_index_status_columns.py => 2025/07_25_1036_50dd7ea98639_index_status_columns.py} (100%) rename src/dstack/_internal/server/migrations/versions/{728b1488b1b4_add_instance_health.py => 2025/08_01_1456_728b1488b1b4_add_instance_health.py} (100%) rename src/dstack/_internal/server/migrations/versions/{25479f540245_add_probes.py => 
2025/08_03_1951_25479f540245_add_probes.py} (100%) rename src/dstack/_internal/server/migrations/versions/{74a1f55209bd_store_enums_as_strings.py => 2025/08_06_1349_74a1f55209bd_store_enums_as_strings.py} (100%) rename src/dstack/_internal/server/migrations/versions/{3d7f6c2ec000_add_jobmodel_registered.py => 2025/08_11_1323_3d7f6c2ec000_add_jobmodel_registered.py} (100%) rename src/dstack/_internal/server/migrations/versions/{e2d08cd1b8d9_add_jobmodel_fleet.py => 2025/08_15_1126_e2d08cd1b8d9_add_jobmodel_fleet.py} (100%) rename src/dstack/_internal/server/migrations/versions/{2498ab323443_add_fleetmodel_consolidation_attempt_.py => 2025/08_29_1608_2498ab323443_add_fleetmodel_consolidation_attempt_.py} (100%) rename src/dstack/_internal/server/migrations/versions/{ff1d94f65b08_user_ssh_key.py => 2025/10_09_2031_ff1d94f65b08_user_ssh_key.py} (100%) rename src/dstack/_internal/server/migrations/versions/{7d1ec2b920ac_add_computegroupmodel.py => 2025/10_21_1601_7d1ec2b920ac_add_computegroupmodel.py} (100%) rename src/dstack/_internal/server/migrations/versions/{06e977bc61c7_add_usermodel_deleted_and_original_name.py => 2025/11_26_1143_06e977bc61c7_add_usermodel_deleted_and_original_name.py} (100%) rename src/dstack/_internal/server/migrations/versions/{006512f572b4_add_projects_original_name.py => 2025/11_27_1511_006512f572b4_add_projects_original_name.py} (100%) rename src/dstack/_internal/server/migrations/versions/{d4d9dc26cf58_add_ix_jobs_run_id.py => 2025/12_04_2048_d4d9dc26cf58_add_ix_jobs_run_id.py} (100%) rename src/dstack/_internal/server/migrations/versions/{5fd659afca82_add_ix_instances_fleet_id.py => 2025/12_04_2052_5fd659afca82_add_ix_instances_fleet_id.py} (100%) rename src/dstack/_internal/server/migrations/versions/{22d74df9897e_add_events_and_event_targets.py => 2025/12_04_2056_22d74df9897e_add_events_and_event_targets.py} (100%) rename src/dstack/_internal/server/migrations/versions/{706e0acc3a7d_add_runmodel_desired_replica_counts.py => 
2025/12_18_1054_706e0acc3a7d_add_runmodel_desired_replica_counts.py} (100%) rename src/dstack/_internal/server/migrations/versions/{1aa9638ad963_added_email_index.py => 2025/12_21_2208_1aa9638ad963_added_email_index.py} (100%) rename src/dstack/_internal/server/migrations/versions/{903c91e24634_add_instances_termination_reason_message.py => 2025/12_22_1217_903c91e24634_add_instances_termination_reason_message.py} (100%) rename src/dstack/_internal/server/migrations/versions/{57cff3ec86ce_add_computegroupmodel_pipeline_columns.py => 2026/02_18_1107_57cff3ec86ce_add_computegroupmodel_pipeline_columns.py} (100%) rename src/dstack/_internal/server/migrations/versions/{9c2a227b0154_add_placementgroupmodel_pipeline_columns.py => 2026/02_18_1108_9c2a227b0154_add_placementgroupmodel_pipeline_columns.py} (100%) rename src/dstack/_internal/server/migrations/versions/{a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py => 2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py} (100%) diff --git a/src/dstack/_internal/server/alembic.ini b/src/dstack/_internal/server/alembic.ini index 5e498c368c..c4c6840f01 100644 --- a/src/dstack/_internal/server/alembic.ini +++ b/src/dstack/_internal/server/alembic.ini @@ -8,7 +8,7 @@ script_location = migrations # Uncomment the line below if you want the files to be prepended with date and time # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file # for all available tokens -# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s +file_template = %%(year)d/%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d_%%(rev)s_%%(slug)s # sys.path path, will be prepended to sys.path if present. # defaults to the current working directory. @@ -20,7 +20,7 @@ prepend_sys_path = . 
# installed by adding `alembic[tz]` to the pip requirements # string value is passed to dateutil.tz.gettz() # leave blank for localtime -# timezone = +timezone = utc # max length of characters to apply to the # "slug" field @@ -50,7 +50,7 @@ path_separator = os # set to 'true' to search source files recursively # in each "version_locations" directory # new in Alembic version 1.10 -# recursive_version_locations = false +recursive_version_locations = true # the output encoding used when revision files # are written from script.py.mako diff --git a/src/dstack/_internal/server/db.py b/src/dstack/_internal/server/db.py index 5f43f52e0a..2eb18a3f3c 100644 --- a/src/dstack/_internal/server/db.py +++ b/src/dstack/_internal/server/db.py @@ -125,5 +125,6 @@ async def sqlite_commit(session: AsyncSession): def _run_alembic_upgrade(connection): alembic_cfg = config.Config() alembic_cfg.set_main_option("script_location", settings.ALEMBIC_MIGRATIONS_LOCATION) + alembic_cfg.set_main_option("recursive_version_locations", "true") alembic_cfg.attributes["connection"] = connection command.upgrade(alembic_cfg, "head") diff --git a/src/dstack/_internal/server/migrations/script.py.mako b/src/dstack/_internal/server/migrations/script.py.mako index e5ecbc9a75..79c96c3a3d 100644 --- a/src/dstack/_internal/server/migrations/script.py.mako +++ b/src/dstack/_internal/server/migrations/script.py.mako @@ -6,9 +6,9 @@ Create Date: ${create_date} """ -from alembic import op import sqlalchemy as sa import sqlalchemy_utils +from alembic import op import dstack._internal.server.models ${imports if imports else ""} diff --git a/src/dstack/_internal/server/migrations/versions/a060e2440936_.py b/src/dstack/_internal/server/migrations/versions/2023/09_20_1634_a060e2440936_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/a060e2440936_.py rename to src/dstack/_internal/server/migrations/versions/2023/09_20_1634_a060e2440936_.py diff --git 
a/src/dstack/_internal/server/migrations/versions/bfba43f6def2_.py b/src/dstack/_internal/server/migrations/versions/2023/09_22_1052_bfba43f6def2_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/bfba43f6def2_.py rename to src/dstack/_internal/server/migrations/versions/2023/09_22_1052_bfba43f6def2_.py diff --git a/src/dstack/_internal/server/migrations/versions/252d3743b641_.py b/src/dstack/_internal/server/migrations/versions/2023/09_25_1609_252d3743b641_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/252d3743b641_.py rename to src/dstack/_internal/server/migrations/versions/2023/09_25_1609_252d3743b641_.py diff --git a/src/dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py b/src/dstack/_internal/server/migrations/versions/2023/09_27_1742_fe72c4de8376_add_gateways.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/fe72c4de8376_add_gateways.py rename to src/dstack/_internal/server/migrations/versions/2023/09_27_1742_fe72c4de8376_add_gateways.py diff --git a/src/dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py b/src/dstack/_internal/server/migrations/versions/2023/11_01_1019_d0bb68e48b9f_add_project_owners_and_quotas.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/d0bb68e48b9f_add_project_owners_and_quotas.py rename to src/dstack/_internal/server/migrations/versions/2023/11_01_1019_d0bb68e48b9f_add_project_owners_and_quotas.py diff --git a/src/dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py b/src/dstack/_internal/server/migrations/versions/2023/11_01_1135_112753bc17dd_remove_nullable_fields.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/112753bc17dd_remove_nullable_fields.py rename to src/dstack/_internal/server/migrations/versions/2023/11_01_1135_112753bc17dd_remove_nullable_fields.py 
diff --git a/src/dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py b/src/dstack/_internal/server/migrations/versions/2023/11_03_1646_14f2cb002fc2_add_jobmodel_removed_flag.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/14f2cb002fc2_add_jobmodel_removed_flag.py rename to src/dstack/_internal/server/migrations/versions/2023/11_03_1646_14f2cb002fc2_add_jobmodel_removed_flag.py diff --git a/src/dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py b/src/dstack/_internal/server/migrations/versions/2023/11_06_1613_23e01c56279a_make_blob_nullable.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/23e01c56279a_make_blob_nullable.py rename to src/dstack/_internal/server/migrations/versions/2023/11_06_1613_23e01c56279a_make_blob_nullable.py diff --git a/src/dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py b/src/dstack/_internal/server/migrations/versions/2023/11_14_1041_3dbdce90d0e0_fix_code_uq_constraint.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/3dbdce90d0e0_fix_code_uq_constraint.py rename to src/dstack/_internal/server/migrations/versions/2023/11_14_1041_3dbdce90d0e0_fix_code_uq_constraint.py diff --git a/src/dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py b/src/dstack/_internal/server/migrations/versions/2023/11_14_1609_686fb8341ea5_add_user_emails.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/686fb8341ea5_add_user_emails.py rename to src/dstack/_internal/server/migrations/versions/2023/11_14_1609_686fb8341ea5_add_user_emails.py diff --git a/src/dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py b/src/dstack/_internal/server/migrations/versions/2023/12_11_1034_e6391ca6a264_separate_gateways_from_compute.py similarity index 100% rename from 
src/dstack/_internal/server/migrations/versions/e6391ca6a264_separate_gateways_from_compute.py rename to src/dstack/_internal/server/migrations/versions/2023/12_11_1034_e6391ca6a264_separate_gateways_from_compute.py diff --git a/src/dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py b/src/dstack/_internal/server/migrations/versions/2023/12_19_1555_48ad3ecbaea2_do_not_delete_projects_and_runs.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/48ad3ecbaea2_do_not_delete_projects_and_runs.py rename to src/dstack/_internal/server/migrations/versions/2023/12_19_1555_48ad3ecbaea2_do_not_delete_projects_and_runs.py diff --git a/src/dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py b/src/dstack/_internal/server/migrations/versions/2024/01_09_1223_d3e8af4786fa_gateway_compute_flag_deleted.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/d3e8af4786fa_gateway_compute_flag_deleted.py rename to src/dstack/_internal/server/migrations/versions/2024/01_09_1223_d3e8af4786fa_gateway_compute_flag_deleted.py diff --git a/src/dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py b/src/dstack/_internal/server/migrations/versions/2024/02_12_1427_27d3e55759fa_add_pools.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/27d3e55759fa_add_pools.py rename to src/dstack/_internal/server/migrations/versions/2024/02_12_1427_27d3e55759fa_add_pools.py diff --git a/src/dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py b/src/dstack/_internal/server/migrations/versions/2024/02_14_1139_29c08c6a8cb3_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/29c08c6a8cb3_.py rename to src/dstack/_internal/server/migrations/versions/2024/02_14_1139_29c08c6a8cb3_.py diff --git 
a/src/dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py b/src/dstack/_internal/server/migrations/versions/2024/02_19_1139_9eea6af28e10_added_fail_reason_for_instancemodel.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/9eea6af28e10_added_fail_reason_for_instancemodel.py rename to src/dstack/_internal/server/migrations/versions/2024/02_19_1139_9eea6af28e10_added_fail_reason_for_instancemodel.py diff --git a/src/dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py b/src/dstack/_internal/server/migrations/versions/2024/02_21_1011_1a48dfe44a40_rework_termination_handling.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/1a48dfe44a40_rework_termination_handling.py rename to src/dstack/_internal/server/migrations/versions/2024/02_21_1011_1a48dfe44a40_rework_termination_handling.py diff --git a/src/dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py b/src/dstack/_internal/server/migrations/versions/2024/02_28_0547_ed0ca30e13bb_migrate_instancestatus_provisioning.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/ed0ca30e13bb_migrate_instancestatus_provisioning.py rename to src/dstack/_internal/server/migrations/versions/2024/02_28_0547_ed0ca30e13bb_migrate_instancestatus_provisioning.py diff --git a/src/dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py b/src/dstack/_internal/server/migrations/versions/2024/02_28_0615_b88d55c2a07d_replace_instancestatus_ready.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/b88d55c2a07d_replace_instancestatus_ready.py rename to src/dstack/_internal/server/migrations/versions/2024/02_28_0615_b88d55c2a07d_replace_instancestatus_ready.py diff --git a/src/dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py 
b/src/dstack/_internal/server/migrations/versions/2024/03_01_1430_4b4319398164_introduce_runs_processing.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/4b4319398164_introduce_runs_processing.py rename to src/dstack/_internal/server/migrations/versions/2024/03_01_1430_4b4319398164_introduce_runs_processing.py diff --git a/src/dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py b/src/dstack/_internal/server/migrations/versions/2024/03_07_1721_0e33559e16ed_update_instancestatus.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/0e33559e16ed_update_instancestatus.py rename to src/dstack/_internal/server/migrations/versions/2024/03_07_1721_0e33559e16ed_update_instancestatus.py diff --git a/src/dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py b/src/dstack/_internal/server/migrations/versions/2024/03_12_1717_555138b1f77f_change_instancemodel_for_asynchronous_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/555138b1f77f_change_instancemodel_for_asynchronous_.py rename to src/dstack/_internal/server/migrations/versions/2024/03_12_1717_555138b1f77f_change_instancemodel_for_asynchronous_.py diff --git a/src/dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py b/src/dstack/_internal/server/migrations/versions/2024/03_13_1048_5ec538b70e71_replace_instansestatus.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/5ec538b70e71_replace_instansestatus.py rename to src/dstack/_internal/server/migrations/versions/2024/03_13_1048_5ec538b70e71_replace_instansestatus.py diff --git a/src/dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py b/src/dstack/_internal/server/migrations/versions/2024/03_18_1216_4ae1a5b0e7f1_add_run_list_index.py similarity index 100% rename from 
src/dstack/_internal/server/migrations/versions/4ae1a5b0e7f1_add_run_list_index.py rename to src/dstack/_internal/server/migrations/versions/2024/03_18_1216_4ae1a5b0e7f1_add_run_list_index.py diff --git a/src/dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py b/src/dstack/_internal/server/migrations/versions/2024/03_29_0637_99b4c8c954ea_add_termination_reason_message.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/99b4c8c954ea_add_termination_reason_message.py rename to src/dstack/_internal/server/migrations/versions/2024/03_29_0637_99b4c8c954ea_add_termination_reason_message.py diff --git a/src/dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py b/src/dstack/_internal/server/migrations/versions/2024/04_02_0142_866ec1d67184_replace_retrypolicy_limit_with_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/866ec1d67184_replace_retrypolicy_limit_with_.py rename to src/dstack/_internal/server/migrations/versions/2024/04_02_0142_866ec1d67184_replace_retrypolicy_limit_with_.py diff --git a/src/dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py b/src/dstack/_internal/server/migrations/versions/2024/04_08_0802_1e3fb39ef74b_add_remote_connection_details.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/1e3fb39ef74b_add_remote_connection_details.py rename to src/dstack/_internal/server/migrations/versions/2024/04_08_0802_1e3fb39ef74b_add_remote_connection_details.py diff --git a/src/dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py b/src/dstack/_internal/server/migrations/versions/2024/05_15_1104_58aa5162dcc3_add_gatewaymodel_configuration.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/58aa5162dcc3_add_gatewaymodel_configuration.py rename to 
src/dstack/_internal/server/migrations/versions/2024/05_15_1104_58aa5162dcc3_add_gatewaymodel_configuration.py diff --git a/src/dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py b/src/dstack/_internal/server/migrations/versions/2024/05_16_1418_c154eece89da_add_fields_for_async_gateway_creation.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/c154eece89da_add_fields_for_async_gateway_creation.py rename to src/dstack/_internal/server/migrations/versions/2024/05_16_1418_c154eece89da_add_fields_for_async_gateway_creation.py diff --git a/src/dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py b/src/dstack/_internal/server/migrations/versions/2024/05_22_1338_dfffd6a1165c_add_fields_for_gateways_behind_alb.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/dfffd6a1165c_add_fields_for_gateways_behind_alb.py rename to src/dstack/_internal/server/migrations/versions/2024/05_22_1338_dfffd6a1165c_add_fields_for_gateways_behind_alb.py diff --git a/src/dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py b/src/dstack/_internal/server/migrations/versions/2024/05_29_1040_29826f417010_remove_instancemodel_retry_policy.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/29826f417010_remove_instancemodel_retry_policy.py rename to src/dstack/_internal/server/migrations/versions/2024/05_29_1040_29826f417010_remove_instancemodel_retry_policy.py diff --git a/src/dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py b/src/dstack/_internal/server/migrations/versions/2024/05_30_0955_b4d6ad60db08_add_instancemodel_unreachable.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/b4d6ad60db08_add_instancemodel_unreachable.py rename to 
src/dstack/_internal/server/migrations/versions/2024/05_30_0955_b4d6ad60db08_add_instancemodel_unreachable.py diff --git a/src/dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py b/src/dstack/_internal/server/migrations/versions/2024/06_26_1122_98cd9c8b5927_add_volumemodel.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/98cd9c8b5927_add_volumemodel.py rename to src/dstack/_internal/server/migrations/versions/2024/06_26_1122_98cd9c8b5927_add_volumemodel.py diff --git a/src/dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py b/src/dstack/_internal/server/migrations/versions/2024/07_04_1726_5ad8debc8fe6_fixes_for_psql.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/5ad8debc8fe6_fixes_for_psql.py rename to src/dstack/_internal/server/migrations/versions/2024/07_04_1726_5ad8debc8fe6_fixes_for_psql.py diff --git a/src/dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py b/src/dstack/_internal/server/migrations/versions/2024/07_14_2143_91ac5e543037_extend_repos_creds_column.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/91ac5e543037_extend_repos_creds_column.py rename to src/dstack/_internal/server/migrations/versions/2024/07_14_2143_91ac5e543037_extend_repos_creds_column.py diff --git a/src/dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py b/src/dstack/_internal/server/migrations/versions/2024/07_15_2309_3cf77fb8bcf1_store_repo_clone_url.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/3cf77fb8bcf1_store_repo_clone_url.py rename to src/dstack/_internal/server/migrations/versions/2024/07_15_2309_3cf77fb8bcf1_store_repo_clone_url.py diff --git a/src/dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py b/src/dstack/_internal/server/migrations/versions/2024/07_17_1543_c00090eaef21_support_fleets.py 
similarity index 100% rename from src/dstack/_internal/server/migrations/versions/c00090eaef21_support_fleets.py rename to src/dstack/_internal/server/migrations/versions/2024/07_17_1543_c00090eaef21_support_fleets.py diff --git a/src/dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py b/src/dstack/_internal/server/migrations/versions/2024/08_15_1024_710e5b3fac8f_add_encryption.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/710e5b3fac8f_add_encryption.py rename to src/dstack/_internal/server/migrations/versions/2024/08_15_1024_710e5b3fac8f_add_encryption.py diff --git a/src/dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py b/src/dstack/_internal/server/migrations/versions/2024/08_16_1425_54a77e19c64c_add_manager_project_role.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/54a77e19c64c_add_manager_project_role.py rename to src/dstack/_internal/server/migrations/versions/2024/08_16_1425_54a77e19c64c_add_manager_project_role.py diff --git a/src/dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py b/src/dstack/_internal/server/migrations/versions/2024/08_19_1510_d6b11105f659_add_usermodel_active.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/d6b11105f659_add_usermodel_active.py rename to src/dstack/_internal/server/migrations/versions/2024/08_19_1510_d6b11105f659_add_usermodel_active.py diff --git a/src/dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py b/src/dstack/_internal/server/migrations/versions/2024/08_21_1420_ea60480f82bb_add_membermodel_member_num.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/ea60480f82bb_add_membermodel_member_num.py rename to src/dstack/_internal/server/migrations/versions/2024/08_21_1420_ea60480f82bb_add_membermodel_member_num.py diff --git 
a/src/dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py b/src/dstack/_internal/server/migrations/versions/2024/08_30_1342_7b24b1c8eba7_add_instancemodel_last_processed_at.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/7b24b1c8eba7_add_instancemodel_last_processed_at.py rename to src/dstack/_internal/server/migrations/versions/2024/08_30_1342_7b24b1c8eba7_add_instancemodel_last_processed_at.py diff --git a/src/dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py b/src/dstack/_internal/server/migrations/versions/2024/09_10_1107_c83d45f9a971_replace_string_with_text.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/c83d45f9a971_replace_string_with_text.py rename to src/dstack/_internal/server/migrations/versions/2024/09_10_1107_c83d45f9a971_replace_string_with_text.py diff --git a/src/dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py b/src/dstack/_internal/server/migrations/versions/2024/09_17_1223_e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py rename to src/dstack/_internal/server/migrations/versions/2024/09_17_1223_e3b7db07727f_add_gatewaycomputemodel_app_updated_at.py diff --git a/src/dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py b/src/dstack/_internal/server/migrations/versions/2024/09_25_1352_a7b46c073fa1_add_placementgroupmodel.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/a7b46c073fa1_add_placementgroupmodel.py rename to src/dstack/_internal/server/migrations/versions/2024/09_25_1352_a7b46c073fa1_add_placementgroupmodel.py diff --git a/src/dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py 
b/src/dstack/_internal/server/migrations/versions/2024/10_14_1126_c20626d03cfb_add_jobmetricspoint.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/c20626d03cfb_add_jobmetricspoint.py rename to src/dstack/_internal/server/migrations/versions/2024/10_14_1126_c20626d03cfb_add_jobmetricspoint.py diff --git a/src/dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py b/src/dstack/_internal/server/migrations/versions/2024/10_16_1431_afbc600ff2b2_add_created_at_to_usermodel_and_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/afbc600ff2b2_add_created_at_to_usermodel_and_.py rename to src/dstack/_internal/server/migrations/versions/2024/10_16_1431_afbc600ff2b2_add_created_at_to_usermodel_and_.py diff --git a/src/dstack/_internal/server/migrations/versions/82b32a135ea2_.py b/src/dstack/_internal/server/migrations/versions/2024/11_04_1546_82b32a135ea2_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/82b32a135ea2_.py rename to src/dstack/_internal/server/migrations/versions/2024/11_04_1546_82b32a135ea2_.py diff --git a/src/dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py b/src/dstack/_internal/server/migrations/versions/2024/11_14_1031_91a12fff6c76_add_repocredsmodel.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/91a12fff6c76_add_repocredsmodel.py rename to src/dstack/_internal/server/migrations/versions/2024/11_14_1031_91a12fff6c76_add_repocredsmodel.py diff --git a/src/dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py b/src/dstack/_internal/server/migrations/versions/2024/12_24_1256_065588ec72b8_add_vultr_to_backendtype_enum.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/065588ec72b8_add_vultr_to_backendtype_enum.py rename to 
src/dstack/_internal/server/migrations/versions/2024/12_24_1256_065588ec72b8_add_vultr_to_backendtype_enum.py diff --git a/src/dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py b/src/dstack/_internal/server/migrations/versions/2025/01_10_1417_803c7e9ed85d_add_jobmodel_job_runtime_data.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/803c7e9ed85d_add_jobmodel_job_runtime_data.py rename to src/dstack/_internal/server/migrations/versions/2025/01_10_1417_803c7e9ed85d_add_jobmodel_job_runtime_data.py diff --git a/src/dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py b/src/dstack/_internal/server/migrations/versions/2025/01_14_1333_c48df7985d57_add_instance_termination_retries.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/c48df7985d57_add_instance_termination_retries.py rename to src/dstack/_internal/server/migrations/versions/2025/01_14_1333_c48df7985d57_add_instance_termination_retries.py diff --git a/src/dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py b/src/dstack/_internal/server/migrations/versions/2025/01_16_1459_1338b788b612_reverse_job_instance_relationship.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py rename to src/dstack/_internal/server/migrations/versions/2025/01_16_1459_1338b788b612_reverse_job_instance_relationship.py diff --git a/src/dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py b/src/dstack/_internal/server/migrations/versions/2025/01_21_1053_ffa99edd1988_add_jobterminationreason_max_duration_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/ffa99edd1988_add_jobterminationreason_max_duration_.py rename to 
src/dstack/_internal/server/migrations/versions/2025/01_21_1053_ffa99edd1988_add_jobterminationreason_max_duration_.py diff --git a/src/dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py b/src/dstack/_internal/server/migrations/versions/2025/01_29_1152_da574e93fee0_add_jobmodel_volumes_detached_at.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/da574e93fee0_add_jobmodel_volumes_detached_at.py rename to src/dstack/_internal/server/migrations/versions/2025/01_29_1152_da574e93fee0_add_jobmodel_volumes_detached_at.py diff --git a/src/dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py b/src/dstack/_internal/server/migrations/versions/2025/02_04_1110_51d45659d574_add_instancemodel_blocks_fields.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py rename to src/dstack/_internal/server/migrations/versions/2025/02_04_1110_51d45659d574_add_instancemodel_blocks_fields.py diff --git a/src/dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py b/src/dstack/_internal/server/migrations/versions/2025/02_11_2230_63c3f19cb184_add_jobterminationreason_inactivity_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py rename to src/dstack/_internal/server/migrations/versions/2025/02_11_2230_63c3f19cb184_add_jobterminationreason_inactivity_.py diff --git a/src/dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py b/src/dstack/_internal/server/migrations/versions/2025/02_11_2337_1e76fb0dde87_add_jobmodel_inactivity_secs.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py rename to 
src/dstack/_internal/server/migrations/versions/2025/02_11_2337_1e76fb0dde87_add_jobmodel_inactivity_secs.py diff --git a/src/dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py b/src/dstack/_internal/server/migrations/versions/2025/02_12_1319_a751ef183f27_move_attachment_data_to_volumes_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py rename to src/dstack/_internal/server/migrations/versions/2025/02_12_1319_a751ef183f27_move_attachment_data_to_volumes_.py diff --git a/src/dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py b/src/dstack/_internal/server/migrations/versions/2025/02_21_1059_60e444118b6d_add_jobprometheusmetrics.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py rename to src/dstack/_internal/server/migrations/versions/2025/02_21_1059_60e444118b6d_add_jobprometheusmetrics.py diff --git a/src/dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py b/src/dstack/_internal/server/migrations/versions/2025/02_28_1512_98d1b92988bc_add_jobterminationreason_terminated_due_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py rename to src/dstack/_internal/server/migrations/versions/2025/02_28_1512_98d1b92988bc_add_jobterminationreason_terminated_due_.py diff --git a/src/dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py b/src/dstack/_internal/server/migrations/versions/2025/03_10_1449_bc8ca4a505c6_store_backendtype_as_string.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/bc8ca4a505c6_store_backendtype_as_string.py rename to 
src/dstack/_internal/server/migrations/versions/2025/03_10_1449_bc8ca4a505c6_store_backendtype_as_string.py diff --git a/src/dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py b/src/dstack/_internal/server/migrations/versions/2025/03_13_1113_7bc2586e8b9e_make_instancemodel_pool_id_optional.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/7bc2586e8b9e_make_instancemodel_pool_id_optional.py rename to src/dstack/_internal/server/migrations/versions/2025/03_13_1113_7bc2586e8b9e_make_instancemodel_pool_id_optional.py diff --git a/src/dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py b/src/dstack/_internal/server/migrations/versions/2025/04_15_1800_7ba3b59d7ca6_add_runmodel_resubmission_attempt.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/7ba3b59d7ca6_add_runmodel_resubmission_attempt.py rename to src/dstack/_internal/server/migrations/versions/2025/04_15_1800_7ba3b59d7ca6_add_runmodel_resubmission_attempt.py diff --git a/src/dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py b/src/dstack/_internal/server/migrations/versions/2025/05_09_1025_6c1a9d6530ee_add_jobmodel_exit_status.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py rename to src/dstack/_internal/server/migrations/versions/2025/05_09_1025_6c1a9d6530ee_add_jobmodel_exit_status.py diff --git a/src/dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py b/src/dstack/_internal/server/migrations/versions/2025/05_13_1624_20166748b60c_add_jobmodel_disconnected_at.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py rename to src/dstack/_internal/server/migrations/versions/2025/05_13_1624_20166748b60c_add_jobmodel_disconnected_at.py diff 
--git a/src/dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py b/src/dstack/_internal/server/migrations/versions/2025/05_14_1524_bca2fdf130bf_add_runmodel_priority.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py rename to src/dstack/_internal/server/migrations/versions/2025/05_14_1524_bca2fdf130bf_add_runmodel_priority.py diff --git a/src/dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py b/src/dstack/_internal/server/migrations/versions/2025/05_29_1530_35e90e1b0d3e_add_rolling_deployment_fields.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py rename to src/dstack/_internal/server/migrations/versions/2025/05_29_1530_35e90e1b0d3e_add_rolling_deployment_fields.py diff --git a/src/dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py b/src/dstack/_internal/server/migrations/versions/2025/06_06_1304_35f732ee4cf5_add_projectmodel_is_public.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py rename to src/dstack/_internal/server/migrations/versions/2025/06_06_1304_35f732ee4cf5_add_projectmodel_is_public.py diff --git a/src/dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py b/src/dstack/_internal/server/migrations/versions/2025/06_12_1228_5f1707c525d2_add_filearchivemodel.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py rename to src/dstack/_internal/server/migrations/versions/2025/06_12_1228_5f1707c525d2_add_filearchivemodel.py diff --git a/src/dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py b/src/dstack/_internal/server/migrations/versions/2025/06_30_1100_644b8a114187_add_secretmodel.py similarity index 100% 
rename from src/dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py rename to src/dstack/_internal/server/migrations/versions/2025/06_30_1100_644b8a114187_add_secretmodel.py diff --git a/src/dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py b/src/dstack/_internal/server/migrations/versions/2025/07_15_1426_d5863798bf41_add_volumemodel_last_job_processed_at.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py rename to src/dstack/_internal/server/migrations/versions/2025/07_15_1426_d5863798bf41_add_volumemodel_last_job_processed_at.py diff --git a/src/dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py b/src/dstack/_internal/server/migrations/versions/2025/07_17_1547_ec02a26a256c_add_runmodel_next_triggered_at.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py rename to src/dstack/_internal/server/migrations/versions/2025/07_17_1547_ec02a26a256c_add_runmodel_next_triggered_at.py diff --git a/src/dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py b/src/dstack/_internal/server/migrations/versions/2025/07_25_1036_50dd7ea98639_index_status_columns.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py rename to src/dstack/_internal/server/migrations/versions/2025/07_25_1036_50dd7ea98639_index_status_columns.py diff --git a/src/dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py b/src/dstack/_internal/server/migrations/versions/2025/08_01_1456_728b1488b1b4_add_instance_health.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py rename to 
src/dstack/_internal/server/migrations/versions/2025/08_01_1456_728b1488b1b4_add_instance_health.py diff --git a/src/dstack/_internal/server/migrations/versions/25479f540245_add_probes.py b/src/dstack/_internal/server/migrations/versions/2025/08_03_1951_25479f540245_add_probes.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/25479f540245_add_probes.py rename to src/dstack/_internal/server/migrations/versions/2025/08_03_1951_25479f540245_add_probes.py diff --git a/src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py b/src/dstack/_internal/server/migrations/versions/2025/08_06_1349_74a1f55209bd_store_enums_as_strings.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/74a1f55209bd_store_enums_as_strings.py rename to src/dstack/_internal/server/migrations/versions/2025/08_06_1349_74a1f55209bd_store_enums_as_strings.py diff --git a/src/dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py b/src/dstack/_internal/server/migrations/versions/2025/08_11_1323_3d7f6c2ec000_add_jobmodel_registered.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/3d7f6c2ec000_add_jobmodel_registered.py rename to src/dstack/_internal/server/migrations/versions/2025/08_11_1323_3d7f6c2ec000_add_jobmodel_registered.py diff --git a/src/dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py b/src/dstack/_internal/server/migrations/versions/2025/08_15_1126_e2d08cd1b8d9_add_jobmodel_fleet.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/e2d08cd1b8d9_add_jobmodel_fleet.py rename to src/dstack/_internal/server/migrations/versions/2025/08_15_1126_e2d08cd1b8d9_add_jobmodel_fleet.py diff --git a/src/dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py 
b/src/dstack/_internal/server/migrations/versions/2025/08_29_1608_2498ab323443_add_fleetmodel_consolidation_attempt_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/2498ab323443_add_fleetmodel_consolidation_attempt_.py rename to src/dstack/_internal/server/migrations/versions/2025/08_29_1608_2498ab323443_add_fleetmodel_consolidation_attempt_.py diff --git a/src/dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py b/src/dstack/_internal/server/migrations/versions/2025/10_09_2031_ff1d94f65b08_user_ssh_key.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/ff1d94f65b08_user_ssh_key.py rename to src/dstack/_internal/server/migrations/versions/2025/10_09_2031_ff1d94f65b08_user_ssh_key.py diff --git a/src/dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py b/src/dstack/_internal/server/migrations/versions/2025/10_21_1601_7d1ec2b920ac_add_computegroupmodel.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py rename to src/dstack/_internal/server/migrations/versions/2025/10_21_1601_7d1ec2b920ac_add_computegroupmodel.py diff --git a/src/dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py b/src/dstack/_internal/server/migrations/versions/2025/11_26_1143_06e977bc61c7_add_usermodel_deleted_and_original_name.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/06e977bc61c7_add_usermodel_deleted_and_original_name.py rename to src/dstack/_internal/server/migrations/versions/2025/11_26_1143_06e977bc61c7_add_usermodel_deleted_and_original_name.py diff --git a/src/dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py b/src/dstack/_internal/server/migrations/versions/2025/11_27_1511_006512f572b4_add_projects_original_name.py similarity index 100% rename from 
src/dstack/_internal/server/migrations/versions/006512f572b4_add_projects_original_name.py rename to src/dstack/_internal/server/migrations/versions/2025/11_27_1511_006512f572b4_add_projects_original_name.py diff --git a/src/dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py b/src/dstack/_internal/server/migrations/versions/2025/12_04_2048_d4d9dc26cf58_add_ix_jobs_run_id.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/d4d9dc26cf58_add_ix_jobs_run_id.py rename to src/dstack/_internal/server/migrations/versions/2025/12_04_2048_d4d9dc26cf58_add_ix_jobs_run_id.py diff --git a/src/dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py b/src/dstack/_internal/server/migrations/versions/2025/12_04_2052_5fd659afca82_add_ix_instances_fleet_id.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/5fd659afca82_add_ix_instances_fleet_id.py rename to src/dstack/_internal/server/migrations/versions/2025/12_04_2052_5fd659afca82_add_ix_instances_fleet_id.py diff --git a/src/dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py b/src/dstack/_internal/server/migrations/versions/2025/12_04_2056_22d74df9897e_add_events_and_event_targets.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/22d74df9897e_add_events_and_event_targets.py rename to src/dstack/_internal/server/migrations/versions/2025/12_04_2056_22d74df9897e_add_events_and_event_targets.py diff --git a/src/dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py b/src/dstack/_internal/server/migrations/versions/2025/12_18_1054_706e0acc3a7d_add_runmodel_desired_replica_counts.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/706e0acc3a7d_add_runmodel_desired_replica_counts.py rename to 
src/dstack/_internal/server/migrations/versions/2025/12_18_1054_706e0acc3a7d_add_runmodel_desired_replica_counts.py diff --git a/src/dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py b/src/dstack/_internal/server/migrations/versions/2025/12_21_2208_1aa9638ad963_added_email_index.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/1aa9638ad963_added_email_index.py rename to src/dstack/_internal/server/migrations/versions/2025/12_21_2208_1aa9638ad963_added_email_index.py diff --git a/src/dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py b/src/dstack/_internal/server/migrations/versions/2025/12_22_1217_903c91e24634_add_instances_termination_reason_message.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/903c91e24634_add_instances_termination_reason_message.py rename to src/dstack/_internal/server/migrations/versions/2025/12_22_1217_903c91e24634_add_instances_termination_reason_message.py diff --git a/src/dstack/_internal/server/migrations/versions/57cff3ec86ce_add_computegroupmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/02_18_1107_57cff3ec86ce_add_computegroupmodel_pipeline_columns.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/57cff3ec86ce_add_computegroupmodel_pipeline_columns.py rename to src/dstack/_internal/server/migrations/versions/2026/02_18_1107_57cff3ec86ce_add_computegroupmodel_pipeline_columns.py diff --git a/src/dstack/_internal/server/migrations/versions/9c2a227b0154_add_placementgroupmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/02_18_1108_9c2a227b0154_add_placementgroupmodel_pipeline_columns.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/9c2a227b0154_add_placementgroupmodel_pipeline_columns.py rename to 
src/dstack/_internal/server/migrations/versions/2026/02_18_1108_9c2a227b0154_add_placementgroupmodel_pipeline_columns.py diff --git a/src/dstack/_internal/server/migrations/versions/a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py b/src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py similarity index 100% rename from src/dstack/_internal/server/migrations/versions/a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py rename to src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py From f7303650d173c9d48aed74241f582ee8b461483f Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:52:15 +0100 Subject: [PATCH 147/187] Clarify why GPU vendor default inference is split between client and server; add TODOs on how this should change in the future (move resource defaults to the server). (#3588) --- .../cli/services/configurators/run.py | 19 ++++++++++--------- .../_internal/server/services/resources.py | 14 +++++++++++--- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py index 0322cac229..6fc427a388 100644 --- a/src/dstack/_internal/cli/services/configurators/run.py +++ b/src/dstack/_internal/cli/services/configurators/run.py @@ -391,10 +391,14 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None: Infers GPU vendor if not set. Defaults to Nvidia when using the default CUDA image. Requires explicit `image` if the vendor is AMD or Tenstorrent. - NOTE: We don't set the inferred vendor on gpu_spec for compatibility with - older servers. Servers set the vendor using the same logic in - set_resources_defaults(). The inferred vendor is used here only for - validation and display (see _infer_gpu_vendor). 
+ When vendor is inferred from GPU name (e.g. A100 -> nvidia), it is written to + gpu_spec. When vendor is inferred from image context (no name, no vendor, default + CUDA image -> nvidia), it is NOT written to gpu_spec because 0.19.x servers + (gpuhunt <0.1.12) break on vendor=nvidia + min_gpu_count=0. The server applies + the same default in set_gpu_vendor_default(). + + TODO: This entire method should move to the server (set_resources_defaults) + so that defaults and validation are equal for CLI and API users. """ gpu_spec = conf.resources.gpu if gpu_spec is None: @@ -439,11 +443,8 @@ def validate_gpu_vendor_and_image(self, conf: RunConfigurationT) -> None: # Set vendor inferred from name on the spec (server needs it for filtering). gpu_spec.vendor = vendor else: - # No vendor or name specified. Default to Nvidia if using the default - # CUDA image, since it's only compatible with Nvidia GPUs. - # We don't set the inferred vendor on the spec — the server does the - # same inference in set_resources_defaults() for compatibility with - # older servers that don't handle vendor + count.min=0 correctly. + # No vendor or name specified. Default to Nvidia if using the + # default CUDA image, since it's only compatible with Nvidia GPUs. if conf.image is None and conf.docker is not True: vendor = gpuhunt.AcceleratorVendor.NVIDIA has_amd_gpu = False diff --git a/src/dstack/_internal/server/services/resources.py b/src/dstack/_internal/server/services/resources.py index aab47de21c..8b38f92f4e 100644 --- a/src/dstack/_internal/server/services/resources.py +++ b/src/dstack/_internal/server/services/resources.py @@ -29,9 +29,17 @@ def set_gpu_vendor_default( docker: Optional[bool], ) -> None: """Default GPU vendor to Nvidia when using the default CUDA image, - since it's only compatible with Nvidia GPUs. - Mirrors the client-side logic in validate_gpu_vendor_and_image(). 
- Should only be called for runs (not fleets) since fleets don't have image context.""" + since it's only compatible with Nvidia GPUs. Only called for runs + (not fleets) since fleets don't have image context. + + The client infers the same default for display and validation + (see validate_gpu_vendor_and_image) but does not write it to the spec + for 0.19.x server compatibility. This server-side function is what + actually sets the vendor before offer matching. + + TODO: All resource defaults and validation (gpu vendor, cpu arch, memory, + disk, etc.) should be set here on the server, not split between client + and model-level defaults.""" gpu = resources.gpu if ( gpu is not None From 9aa8b58c0d65d4b6c2ad672e341862914c83d6b0 Mon Sep 17 00:00:00 2001 From: peterschmidt85 Date: Wed, 18 Feb 2026 17:10:32 +0100 Subject: [PATCH 148/187] Only cosmetical changes to the website page (title; description; styling). --- docs/assets/fonts/Geist-Variable.woff2 | Bin 0 -> 69436 bytes docs/assets/fonts/GeistMono-Variable.woff2 | Bin 0 -> 71004 bytes docs/assets/stylesheets/extra.css | 46 ++++--- docs/assets/stylesheets/landing.css | 133 ++++++++++++++++----- docs/assets/stylesheets/termynal.css | 2 +- docs/overrides/home.html | 50 ++++---- docs/overrides/main.html | 2 + 7 files changed, 163 insertions(+), 70 deletions(-) create mode 100644 docs/assets/fonts/Geist-Variable.woff2 create mode 100644 docs/assets/fonts/GeistMono-Variable.woff2 diff --git a/docs/assets/fonts/Geist-Variable.woff2 b/docs/assets/fonts/Geist-Variable.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..b2f01210625c8cc9939508fb1f7214d21eb41357 GIT binary patch literal 69436 zcmV(~K+nH-Pew8T0RR910S`O?6#xJL0+v7k0S>?b0nE(+00000000000000000000 z0000Qia;BJkO~~BFh59EK~j`|KTTFaQalD=KT}jeR2%@1crSht2nvGSe1e^PFoW1^ z0X7081D-Ssj~oC5AU|zobYU+Ak8B6XR}2SRy={^MHOG$E22La^^Uv|LZC^Z83fGb<)%gL*$0VV9%i-7I`4rRWSSN{M1|NnO-Ut(;(x1+b)-~bU(K~=-7 
zs`_83n2BX!7WeR4$Sj#nE=>d(tO?mlw${YEw%_!1=?CcOTbGs%WduTRL?i4*m?%;7 zeNnkVr?^>+84eaO#BqjRAy5{WC#eZU$AFWeox#B5c%inv&TxcT`UY+?-Nhsi@QP+B z$4-iL3p59pa;k2v*3~H3r0gE;yckSwu3X+SJ|*W2?~p4QVUVJ?kBw{(l#d=h=YIIA zRZ`uODydSp!etC}-q;c_xpDTL!#V*0tE@7!fC zzvAv%ZziA8h4}&fG>te?{`&jf|MpU+x2$|!E@|aq>O#dk(}!$YO(#&Uvde23lZs+1SI=T^S|nrciw~<-hvLWfLwBMnqg7A|f^oL}+HFnVG&#ewBDj zGbXG(e8@fhOMMB59wxgF;@dx(V$48TwP|M|o7bL*iDgOU+FVk}r>{B3`K+uxcE zHnvd%0TmP#P%-dMEcA?1JjD~aXhlyi1@$7;tF$h>@}h*-e!~C~k(UZZk6-kpMVKkG zK~JnbYo=qYNkCbAGEEWn?Y&r>|+70l(&Huw!Kteo`A|V_ONX(1~ zmC%_PBQtYE=A4lu=FBm!Ir5S-uXA1_a^%RE8FS{i%*cot5uuS8ImevWIp;N>&v~75 zT<3Glah=cWb#D71H|t+h*$x$TIs>#0jgUZy5<;+mhXSRPiq4AIalhXFZEyc%M^WpD zceKL)e4GCNCV>b^o3=@5nyMV}UNP?#{k+fo@2--`>wW$uF6nJOLMVX&hG7^HV;BZa z@>5QGpD?yV_?b%#Y57VdC56MSJ@FZUKXSfz*D=Z#jYgvZ()_t$Bwq;phA=@&^ zq{vR}I#8kv_03ygtfDON=lt#0p8W)(4nwCYPMcDirim#;Cp-I)qM5?bI_J=$TUcr* zJgCdH6gEa&?Sp%#iYcwG5w8lQ$W@=d0;ND(pcH5ev=8~o?A+wZkKSqZs;i}L0a;dT zIdRBICYfXr?Y+iLOjO*d#C>Yo;{$^h-}B0G|(L2u6ueP6I8MfeLw#B zy^IG-@S0N4p(eIviD&rd{Sz;g;ER;oWYncNgr4PrdZB-v?3|jKMVPmR(qIteckzf8 zg)j)MkARxC3#~{{nn58P%3uVW%NOZ>5at^pwA$d2+Nys^Dg_iwH`B|S=>JDmJ*n!+ z<>W#P!23g4e_pk2Zh^ck*^Y8hl&iIqa89hr5Z!}cC#h5~K`Kc@BT~h;7qglC&-{HE z)5xJt3r&O90grCcAc*YNb2^`iPv*g$x;GAGB9+* z6z)`D2V8AmZy{1ur6d(nvMqn&VOWbEIz8F~zYIfOGryl3SrG+feQwAXV|(+f*mN0g zz}CD4|KGiiEi}SZ_#TgOI2=lioxJex%ld0}tfX}uEWZEysrDbzq`>%Yfp~QMgxid) zp4z^)@`E1S*-%s8lVl5A!q`bZaM2t9)!o0$<@eo^om6#8<0ZZmP+kNcYsj4NBG^~Y zbqVRi4T$MrJBD0bJOzW63!b()Zx<{M16&AzZVJ1O*)$4HTIdf*I-~3_0t#T@&)Uzt z)?aXuO#&=4i^{FjW6y9F^{Rb7Vk?VdYJAePgHUNZAPNH{+=aQkFjuI|{r{g@(trI{ z8A;pp)+uu8DAi6^D&11qB!K0F5KErPNdBX9#@6f}{V7|jwUcH$z2RC>sQ`|)l!{U+ z;SHe9LQxpc3Can80T_e@vYKpsv!-wgp%BKw&`VX`okZa<3d{d*)3$CEGFPQD#t zCrdxeMdvuc)Ggr5ee-k@PLHEq^owOaa!@Xe52v9P)=CECH5$`YpH5Rs=<} zRhjQpyIXBXP_m=-X` z^Y)9MmgHdAD2Q;9HT2FG<;fLN)mYn~FIDaO9|#abx$KY$oiC@2wti~cDCI5(POIA9 zuXPF_%nU$k08sW|s4@U0?*NjM571>q*&7ZhDH6$KB`Q_kBFibVoT4b+mij@;_K1ph 
zhm?|ql`XWkw63kSp7ylI+>TpMt=#T=IPUSf_dV~TPG2b+6iB_MagveSddpqjF7L{&t{#lmVpWGrZ-uKssMV|v$#!}T$Uwj> zXu$GdGjWp9&Z#~%M*nLZ0IZ&ZrM6yODv~sjY^u);`ptcmXzRjUNY!F>-~bQ^;M=Th zyM8#qq@i3$^}2!e^)F`E^MeK}$~p}0A_Fu8C?J>LuYcp8fBYwoiF3mCeBq<T2J_P2K*zc&}j{`uGBF)iK(mj1;ltl1D@OI@dD8GHcxC0k5%#z3seN zt<$LAEId7BlvhRTy3pZ%VYCiuv`1%BQ=0lTraf!(`Bl4^#V=_Yt8S0X zHp~d4jWyMDbKiU3)LVHw8{Wu9H?z4dZ+kngy5p{U?t9e(Z@Sq^2hT4~R>sbnJ@4a* zbAGw=@&_M&L<&zN(IOzBV_;IF&!VVFu~Se~QrEL$)zF2|v?VP)BQrNI05mYUppaxN zCXTfvk)aO~N-X)OnmU4rDN~*bR3x0pVu&MqJ!Y2`r3JlDxW6Cn_x4xzuibywQ|F+2 zuzs+4aNWVpV=3LSKMu#e@y2*-ygj~fe9`!-$?YRPIhZ_Ux){frYk#^n-I-o9y>|Mt z^Gjye&aR)m{9>{Hct5k+8tcye`O17}e*XM|`Gxb#=Qqx8nm^_6;gkb<3HpZA=ES1A z=Z@WpJ9XzSiV_vBrBiS2N$D5hgLKTF`YE;sKZzV=lEEaH1~9j{QZ1~F4f4Lb zJs+Bntn96zzVe$S3!I=q-8r(e_1YrQzP+@)gtb{#7im$HWsb@ooOK)LoBr19FVDSN zUT?vRe;-?M=$l3xupk%$F1}L79T5M5i?8W=4{|?J^r_ZQ-Te}xej@qTl75fCC?GJ> zchCs>X%2$^5zMs^AYe93$C)cWu&|Yb$w#-=;M;GnufaxVZDM5@_2E_)CcCAr!Q`Sl zIrycTVJBiinR)Tama#`mfMBseK>&qC&Rhhiw6W?w>;rd$vC>VDZ(1|?H z&at*928J<^tak0SOS6EpHL&H46JX@x=uO7v$0T4qX0Npu1=-sMwU~U!No>LVQ*4;s z5Lkv;D?0^Fq8dge=DIgRD}>6DGOoXOBedw>Z5@x_A+|nhRW9}Ys^nbb#qDj?KsTlL zOv%U`TJcDiZ?1!(qT$qRfB){4Te+7V3JrA8XKZe#H3U%#NXxL z(^gN%i#5lE6k8l|$CrWDiuWqJdiu5d6Ua!$GevIvW;%3{6Uj4D4kQLDy*Nf$tM#jA zDe>}QgYIPm8@m|yY&pp!!=TgEq+y7)h7>o3E8NIIqI!C;1|?{FVZnMD z85&qvd)fgB|4*jh?R|au$!i8wSFi$X8x0xg&D<#XbKAF!7FxS0aWh+?AZp@og{g!6 zMsBIO1sMdrhwrPn^HSxch0@H#nVTa3UuNg9q39=8Qr_v#womF1CB4+*DoSuCx)v^Z zTb{yMCYie)zD27dB(xaqT%uMi0~(a+$OEJ_AFdmfQN3&5?~N*ub-mZR(h>;pWjN3F(XwWx+joq{#m$#0^*y+ljyssJ)ez^{`D?g|< zk+uih9caN8nX&4P23dPx#PSbprz`Equag+7FT19LE&bj_&Z^x+vci%w3RD?D>1t}G z=GH%~^)+X59lJ@VF1>l!Sb(l#4a2R9;kV}@LM4V}_iL zB@(2*n-nz!zfACSah?DbeFEMhKxCkF9N{KjlQNPGq!5``PGq^HN#&weX)ILZGA?ej zzRY8+VopCij58sx7GM@27SG13XH7B8;UzD5yZ!c(;^6EV34eO}X54@y6fW)vc1Cj* zyzOKj;GhIwUiJy3C%~CgW4LzREIRCo+azgQAI}08a%DCM$F6 z#31fsi%tJ`L<-kYp40961X~^ z_zyKo=8j@RO2c8}Mpk){>yhvpXfo#H%)U0(Vxgu3nP)nuL3N*T6b=-w3O&Fn z*%uK(On{3#l?LD)#_qojd@;~myoafv8w^g5C-?n^rkoSE1SD1JSc7ZxBEnZRVqLn4 
z3va(^&$$|zGSigE3h>cWvdVO3>F)|8G9bNK87lRhH!-Y&0bPtHDbK{jpWtUPU5#t# zx2%b^Qr=$xIguOy+?AOIt>fnfoBM-C*?ZcnW6&k<-S{0VpQ|j)=h>O&+kNJewpt7B zBBU&BFL4bF66zCj{42q2ROy+1jhP4KSJ41ais9UkE4KNM^(*naNeCZSj)RUYr9XoO zH;va$?$c*!dNMZoV(@Ro|MY?5dno|f=U&>(f0GUc95!I^$L92)S)W7#cJZ6a1C{rp z#Tmi3gNIh`lc_u>F!2c=yjC1yv3~wuE=UUhkd7aM`MU#W1aIkoE@$T?i%P6N3$_X^ z(q9~AvWNs^Aw4Kadh+++we!oY5WGg2_tV3m3diCcAAO-v#0B9lEBwwZy!1k0Hxu>0 ziOm%d9Ke|=9;BBW^bSl@vgS7^8w44FiWMiWY|m5{|K6^s9G>#~!Xk*u|FwAp0TsLY z5ETrI^|!ql4Eckl#gX#g#*30d!LN=9Y(X;X<3!<0KG42{|j`tj*hdZdfy{9~y>DLo%!*_$50B~kz%-;ob<=613v2C~;H~>dgPnqdk zUlBFdF+e!0dRdap}_Y zMleDUD#DB+U>4R&D;Nw0gApnttR-Z@!cX8Oyo1m1UHlZk7ymSdtD-Ooro$XqNLw{p znxD8nuqAfFJ~%8r0U~2xmWjZ)8nMwPe{{iQim>HZ5U#4kNs=bZ9!#^4AZ)>c5QK`b zlCa@~1FqneR0goC9JtOHoLcK5BCA{3$U!dha2Z1w79$TGaF9LphWzJ-kYdLP*v%<4 z;WEjoR7uAZgLE?wQD7XAGR&eg!1A;@gk8o>;&5KD6VF_6nB&1k5Cps{kP7hW)7MdY z`i^EW9V3%FR-<)XMutMexmwmT&exgxxX^+Z%te?yMGJw|tU`@QeCK5e5=I&7=`&n^ ziY#>5u9X_`=#EPUFwLj1p>42pPT^c>39mB52G=Tx+37ghRRm&ok4}5k-f9+dV~^^k zeRKc^9Yu(gnuk#vUCkOd#J^4>_1dh{>y|CJkVEX)z(F{7wR8Bj?Ou~kFpJn6Bm$E!Qm!VR=RAE%U zRcQj|swNeGYi9xG@mo}Yk8Zj1uup58KMRcg?-NJHvg`$MxiM3y`^3PyzV8jEJ29qd zr-E>yPho1HueW5PpD|cy;|>2EEc4#?j5Er}u5TgMOi*5;j6-NCZhE#HF>PXYzY{l&}BEkk!49;>t`7oJCgy)1_ROaGMN|enO54&p4R*`*7|d9COW~Lb^%$yG?q@RhqfrUY4v4L zoFm0Q9%U@X&(P!{9er+}za_=oUGw}Sb%p*F!NvYth8X_8r5f)ccjN$GC`K7uxs@{U z@-$&HWU6EvsGKUeCXcEuP`IwJn^04Uh_$(Cb(D%;_vV=OwmepSt&Q8zO)ibfc_yQv zr24Ibkprh&pg9LihaBda`norbwYkm(9xi#zc|LJUYKR(I7uJXg)m&#aA6U_;Z#yM5 zL=CMAYs7?VuCuO%OY>Qa18L?6t4nwEW}+yfE&7kN`O!;>9~|(Z?OW&DFo`v@qiTP|DxN5I53fs8FRw9hZhhG-=VM zLzfc@DonftiIOBsamQWv+&35$Ny}IAB>!f$JM2X-8EBBfh8kwL z5k?wiv@y0j;;3WJKJmZXMI%OCmNjlN;sQ|3%|1AiqxQO5T7TbPtKMuzbfN0o1ta`#u%I?Hhr@t z0`^liLZ_#_KY-)@c0Y~EztWQaH@^UB<{2ahG&~fLC}Bcnrd6C&o!($FTdX!a)2vcW zN6)~>#0+9#WdpNAI5@eW+&sK|`~n(8#WagcNSgMDM?L0oPk7Q(p7xAqJ?E6uW}I=> z^UgW%qDwA&&FkK9#Z}jQ?Hk{AeL@OKDjF6rr?7}dZI)Zrz}wn`=?k`Y0%_)fj7-cR7FISeJA{Li3(C#I%f~OEK~zk$xP+u> 
zk9gE$9`}SNJ>_Z7c-C`HIc>%nXFczn^DesNve&%s4Od)s&DXy1ZP%BOf|81c1+K_Ekv94*}sK( z2U~Z2cXTp!1+&@_`+c4Zv=KLNoC4wl{IQZ5(?9r^Z?<1A|Jj(xAx_Np5Ib}e;|JDy z5}CuaYgLXWC#7;{v3DjjXY#i%uDxr@aR zXV}r1nAsl6Vxq1hF`E^N{>|4CY^SW_hL8*7dC8C%9zh-&i^7fUj5dw4!=W6XZALia zZ6qn=d7f281A9J~hx!Tf*m+&2q8;307AT%Ac>7Xv1p4d3whp164X`Lv)?mmWk8N%yWUw}4%Tb_F zg$_NYQd3g{3pI6~&SS&Lh!_7^Ye*Pw9R((ur#P%xvKmWfEwaL9u_R=eHhL!cr!5rE zLdm9D7~v&VRWl^I)_G4{19R`y5m$XHX)xeHd8Hp7Re!VM&iki?a7c&N1ZHV9o53bX z(RmgMGiNK8D$k-i<(AvE>(b|-qfWihtrYdqX3M5#^kUTn|2*i!+Jv)Iy!2dDWfx5| z(ImxV2D3G@JnOXtgskq)L1&n^Cc2Pc5fq~cJ=a4X+ku887PPHl3lkjJIVl{f2W4if zp`7H)O0guc?7C%EM4OuEi-Xy;drYw$;x^}rjZo}yW`a&;_@-jcvXZUHCRovfj693R;y^Qn&xD7FIZdZ58jv)d z!6PqQOChS7tEnDAM_pUF@kH&kCTT>@d{!Jvg(qXRH-ovc?fGh4_~hJR9UNs;aO~?& zZ4>i^SfhjO_#OqC+qY*s3f#{bg3UFD?hu{)V0fJK#GS!jLO3I#(rY&2l}kj?qG+i^ zb+ukADBfx!4J{3Al4y&pj`b`*rT5hhRBPD((VsuAWAw1!s^cR**w!zcl`b^gxglYb zF2<&?$7OIx1$pe5Rv~-A40gp%=wFy3w!#fdy^E^0VR1u~ty)jsojvw1l|AJ6^Xfw6 zmKb(J6ZDeC@;7?o8zx=+DTWGGS&XQ+dXiDnHe3Nfttw-;WLr;-11BVLtZR3yXZv?( z$9HBICT_N#h!froSLh?{y-oSjl#q3~SKv1n^~5g&|J!k%>%Y;Py7>r@+W4!bekQT4 zdGy;fkQybq4t#WfnYi62PYQ?3A?teL*+6{kYQHeOAQ)CE;bqcb@!LQiR`dbV$FWH1*c;*AK2x!Elsgu&^#H87^GGH>F=t00DCWdfT6#CSLzg54I z5~<<)KXmCxdd-iCK>hYWY-`0xuIPMC$wrku0vRnIzC+0V5Ag5y?sJE~OC^0V%&DkZC#N<3FDg{|mk@WTDf^F+=#;`gQo;9* zQ5%?$!YOgHg_}8C3nZ2JfdVB>hbJTu0px3Gus=_Z2iM%SrhJz{oH!$(_-VB6Wop!i#qD~0=>oNTgEEu>IFeDe+#vo(h)|w99@74YZ)j+r z^Uwiz>F&6ne9A9!h7=TJ4fib-J+^{WA;m}y(xw#YG^p0FQEevZbuu#PHH*^E&cn#h zkDyXng{&dvcp6_f2oC?w{efZQF+V8=KEoHUTyOkwzy&1|mxVp-36B z&_aX?BZx6$gb5RfDRaaM)({)EWNhJp#gQY#i4zUZc0zVJjX1*_;=>2w%O4Wp5+YC_ zBuE%x;W9B~xlGU%SBSZ)n2-|Xq*SVd)N3Q4T@RwyEp)ftM%-~9^1uVcLysX(JVW#w z#4u!ppi!fwJU33jgbBo?Npw@DFiiUm^9Kj^H+pcSEkQh`R)eD?3lx2MNQAEt6fL^J{){<}bUxP)m1$?d zdFdSDxor@Ud}j!y{9qV*KDY#&FGgYV%@{=Hj05MV2~_#ZBnA0f4mkfz1Cwz84-f=F zfG9>v04YUEfha@DK-hCgImll^E&-uKbRbrNQ~*+i)Fm1;G_+A;l_o8jR;`IPZB;sS z)YYjo(XG3#9z9ih_15dtm+9AEp8*5)1`Q>Kjnpw}wBERh#H7iLo<3t_O3a!qm@}7} zXUCD&5>g;DHfOG@hO2&Epk?nNGTUrjZDtDxyvX#IUj#6AW*GR 
z8yOVLg+x+Ov0PldfeDF(M$4t78jzN*$;hPEtX0U$)~w4VHsmX8+N{~KRc6~xV%KhJ z&tC4pL4`wy1xK!?p7%oHMK3n+B`+1d0z$oRr@R3R-$eD_!u8%Jwcnw&-eoE8@uK%d z%@1VVZ>p64F)EH(4X(2*Zg85v#cSoBs`?3%|4>uo5t{uNi++M9KO>`0sq}MZMTQfg z0ESF}r%Yi_n<1RDg4AM3bh(UMsn}7i;?`<*)a#gy2B_J@Y_&k`j)bKpTer6C+}*{0 zIxX#)K7>a;#Dg>?B^DMHRfITkNNAX%K_DnBEG#e>3L6^>M~a9&NxMKuC6HbyZak^O zkaF829PuLL{scbjxF`i)LRCmR-^+?nQxr&?3#;G=Z!LmV@6xF$L<0Cjufj9CE`~)r z{9dIfA|tLWOyzDzy3eb4uy-u6re=~T@e#8Frvvy6@`Q0}hE{ z0Z0MJ!9zubFeDmcXjy`{8I>x7wr?esu7vI=o>tqCO}l!_utP|e5HU!C7$hfXy?YfX zY$IqdcVj_b_HXcxWm@hL>Lx%MzwFJRuM^Z zk^yl97?e|&L`iIF!9=)N*dSloHsBJr0hfr4P>Fh@!dB1)5@JJ2auuRg zoW_J}@`Vkl1vTUuszPWFPfvKL4&ekQ5f&~@4F}q~08C|U_{t~-r?Hz>i<2QOUAy1#E=*C#at5YQbL3isyvDfs(mpOq(q7)jPjBgygwx) zIRz?<4OJ(5K&XSkt3WTl7Q8c$Ly?-LmN1gdupu+13@B1G*r>uX_^mh79PF5k($cz~ z`Sjjt%g#M%DxT+(`W7ou1|u^PGBYR>cB6?Vm}t7uW}9o0vNbV&?|^zYX}uRZX~2?? zlLi9$q^3A+nH2y@oYG&PQI?3%7k)k4U?>RvGXNtgtvU6N15`3j5NM(9VHf4cSg6m$ zqHd%;_-Y*1o%Am0PM?R>J$s%?>*qoF3o!l&J^M$bn4%5`VQpG^wyUF)d7q}L>!Xr6 zfb!m}2aM`kJ10p4D0+A3%m*-hQ=Jj8hEs7yiP|4bSP$dFksLqeR?iftkGX;8Lia&{ z2i+^~jIUn~Rhqm0W&(=x_}=kEspI(rq2dp{czWqyB6vi^46~&IDL!6C8w0*~-0o3) zl&>9+U@=bu)$mGqH*v2QQN$k4;wHO1fn)6SFj~0Fempp?(_XgPiAEl`4a01<1#ULk zglaZg4;33+bKc%?QQp(wfL-TPh6mjB6vM-vPkv~DRnCEzi=7#sgo~U4b9;dkQF-O9 zE{IEwAzwop6yoJO2t|rddenfp=(hHV0E%11ljlWw)+l#Fja08AfunQrcj1~r<`kdL zwF8k0U4latA9_@M^XU-R0y34PlCRASfZ{Om6r*Ti$IrWIhvCUc-*FN zb?=m$Tm3fUMiymU3zd&6)zeqqc-kJTX1QFby!fmCMXZqSwyZ4LdC-N}NasF(0qdIN z4p@#X^rGyAU*iE5$cv!fq9wm2+5}#RlP1cB4)%)+5xiLODm7cvK(w|U~tj`8zs`IPyDwcW^4vE>lqKY*=SfK=e zNp{A)J)yFJ?l++p&bkX+8iJ#}TB&SrXEo2G66`l}jcsZ1Y|9kH@A&4#*!r*{eX6rQ zh(I&nRYjn)8}RiiyTqMMdyaRJ4-T-tPT#wlwb;olS59@&ecJ(jZ{k9$Hn`kzZsz@~ z%g*RQTRWXQ3EA{Gen(g1PP!avwTp-PN>M6tcpf&his0?jVu_7CEIH*$j|nl^`6hbz zg~{8{%z2CM*dj<5s1b}D@CIo)JG`10xGMp_>Wlwb$qZ{8wHQb58poXRfN+jENM`2{ zFeMY1a!-f8Q5=1?-pq8+WnfVPLFC3$bK8=aH3T{;>~$AiVr1* zDUU&Z3X^VgTJG~n^I?0bZ%^I$RFXC)f-)eup)wncC`#tdt-ipX7AD5ihjZC~8 zqAIWVPT^5g;6b%sCS^#dn&7KEBEm1l_i(bZS2(LWS3I^+0;hop8jJih|b(w%fO 
zKQx?25x^<}0LmFG8dHKf3Y$jFq+0!nhAm{B;j1{eo$s{RX1r>x%k%+|e$AG+_);Yp zf*AISmxip$KXrm%wV1N&*A8|#l8aG>Qt+ota5>h?D7qFpuu0Kc_iaL3NQ!iC?5m?k zHf_)EM_^T3AsrTw7GAhY{G|O+GTiu>tfkafctKmDkwg;zYBCwd2PDxd_ z5q8GfwquT09ur-65hL{LT3D`DaIxmF`R1~K;2!+K`mW7fzm(oBnC$q@VwdG2Y`DK8G*hi7aDLz=tG(nb`EMZoqq?Vwl5>vCG ztKG7i6x@z+7<1z97ntZ@wYGGg@!1EH);j0ayF6r#StN`>k0F%gw7UaEj(Nb)wN7^{ zY=ecZ`rS9!YrgJ;-R2-I*KJyi!Mj9t@)VP?MHxi{1Czt%f4Z(MB1X=P!(`)@2^y0z zaM|4xQ-|rkVzrp6@mgsnjfV|d$9S&QD#mlA!sd1b zi*{*Mr)*IfZsQge1ARYRQN9!gJ=cdH1SXJOd? z3jrNuKr;H{X%*RWaK-78C3aV?>*R*zy@aS?jI))K77x@qi>z*3D#`q8jR*B_jDpb> zeJ4Zm!f-phh)qRT`QSUEaqYn8OAdW7Hik)&!urtTPOgs{q;PF$v2|7*Wm}xXB8y`x zk*JA$_c}Y-dgR1a(_W1j03x4-17j1R&}euuAiTuEh`}Q~7epXRgcONRrpr3Ok_~Yg z5+srkL15lxK2D#7sk^*Sp}hi^^naX!9%5gQX$RffdrOioEAa_-0A4Z!TQGjoK%;s> zHuUwx+tH$CXF92cVdPSlp`C1qK&il@A||Vt)h_L=q6-hPw-=Y?8bgVy2m3>TRLg`C zok00J%xe{+#&-md@e!c-o|K@UB!uhvgs7h(4zd71{G@3_#eb*M6pZV5RGnX|*o)pc zt>mY+Yw>R=kt80D?P96bmvZ83Ax?OB0P+cAq^d8 zyi7xy1{h@OQV)n7^^s6%Dmxk6(7&T2D_ay64BJO0Z>HKvd_C^iNBLrq!4yvEnGAbL zCqh)V%j6sAKD zEy+?}IC381JLm8(mH_9Tx4hyBEJpqblw=#Jx(9eGAUgAsh!Kst#$2L1QjI#f2Fj@> z4nWG<;8bambAP6T!(I;wP~Ww8aW=^?mK)&HaF0#sPq|HgvQ>;ZXylq)P#3)_I(&5N z;DBqz2)*AKj&O96{gACWKX)`19$sB zgGNYFu5uqbf`^w~@mvxaMjyPJJw2Rb7pUWFM3NIZQR}OekoUpuWiO1iT$Z%0qG*#2 zH()%yWUZv6S&1B>HfVH$g{u_UP`AaNDrmt;%NMSId%nXr)!LdlX$9ug7*8iqUFojD zolx^D?C}||oxOVmbY3r8%;=JlZ8oz2afIp>d8$-1clzN6X&wHM*fj`IN`-s^Kv{@2 z&@e%8hDih};BNCxVHXm-OI?X=EYPZwwNPsa`s;txo4jB7W0#sq%oQce!XGA9goRaM zI~%N{fqix>J7{M&z3gY;_~u8sz)8+>A)H(eFV`3hD@3Z1X5EJA&9b2I6d_4Ddw${v z{@`vpQ?t353DW!eRpVizAaNNUrBxn(8wLf z|3>XwQ~&&P)z13=_hwqNKWMhC;7_>UgIp38BEy`FT;mkvCa0p1EJERUFN(xs6vawI z(FzX&6!gM9sGR>*hu-FjYTZePe4}m))eX=)e~l0mGi0LY49t*+4--Q(F(w03;B`I> z%#f!Q|5r;<469Der3fb(h~zWSKB7FJb(m;^Ar)eC+{gqiov0<+tTq?Pp~NK*(msjEp=o3hS?EZHZ6gWNeHvxJ-t zfk`h>tq*g)U0XP}^F*L!H0BiYYQv#uonuXBB6LC7H@gX|S}+#D+FcQha@G8$Cv!>Bs$Q4+V3$U@0VcN)jFR}y+28FHY^>rHd`37h#l!LBpxZE{f!E7PUJ*vFM*30I5 zCPm~a&+;74^8znoATkH{ETS4~My{RO5&|X-v-l))Jr;AiUcjzlX_y$DC$83F2+br% 
z0Mk?JQ)E`!a3Eh|h5|Bz2$G$i_qtxyS@GF_Pr0%?Yi&nWl`({0$Bo{6Q0W z&=3xI#D8vy&0xj!m>DIRow+H?{47jGDzhZZps@vl^w2;fO*GTO4O+QLTLRsqLNvM| zA=hc)2W;bqZ08Mj@FQCJH`@3yJ9(3K-eMPT(-G&Pu>lfthfaRNZr-Jf_vq$*dia1| z_OO=^>Ek2z@iF_O1_pox2_y;u0~rC_|Dccp(MWO7vnx{M6a~XD2fa-pAB}{4>Y!JX^h%<-=IY(||Lf-7Uus~L&%pf_CS?YUF!pewRy(xuVvB|Tz!w5Ly%Ba1>3 z#o8>!^+>mw5$)xt#N;89;OV=uEd2fVMkV*{Qad4eqfD3Er zGZEdqYCE_-K1?DVk~WrfjzAG90uq!^K_7IW3pMmZFEe0)Uwmc;pZp4F#A=OeO0(7~ zs3nE9rcGBHiNRs&-Dnbq;xvbx>q#zjv1hu}>xso!+-aBlJmd+_Ipk*NTJAy@`_`l` zk76#5FX$u&k*QOeT%lB{3&mu%dP6I7aklVB_EwhdxSk(`QJkb%UX)ecv|T?ij2wMg zxBbv9Iq8(s&JmF)tJSDg=R>x@IKNTphLUe4^Txk@!FR?fW6;(a_;{>1&C$znzBQLs zUPYBvRb5TB)m3i^{V)EI3K3zVGQwhO-l`33_10|d1~;?|ySPh8WiO9^Da1A)sJ`v= zek5ZyiV9d@bw>pv%oQ{^j$*Jy(DvL&SAhHAYo7%>@TDpRRME7$M=fAf0dDT=R21T< z$TlNAsf~duDL1YtD>;U`QYKv+bOZpTLEa0?YOT$rn%)hw;C);jBLU}Y2QDk*6w zfT7d=K%Lt9N%>*YE^2<8&Ln|m+NCNYX z6_s~q!`^Sr4_e@dEb_ya6g<6SWCo)%8K2F>946;7wIb6iF}n(LtFpKTOKY;c7AtGB zx(=WA4ZiAoR$;wDmkX_@+($k>ZZcgc8-ASV(1>JGD{({qxLhG{~r@r50l1 zvGugl!7KufJO;vP80%tf zfY_svq+tab3{<^p_s7kCUZTZFlqX+-aw4?4+e;rbX9h+BNEnHjC(i9X*?K85WXaZ~ zRi{1n+V2Pj)o6kl<^@GV<<zvHeV;!p-Y@xDdH~@ylJ!8rKB2 z210=GmoE;A$FKC09g&0qQ!e8tkxP-(5j zbVudI^hVP!rZ0tz!B~nlFrH@F{7P#Ezq8FA{^UC4E^)P6mGS;|4aZD#5y@1>9z(mS z$1T+?h*T>*4fkr#*y%cFxy6gJb*w{MxkX3X?^dr8clm+bF}7i68Q$%eyvJ`3Wv@57 z-AIFdA=_lrb&a@#x$Kbc(&N3of}YUG)*JClZH?`~?#3%be7vHLg`U%0j$h1cd2GCH z(k_P^?aR+u{`g*k^T|$!4JbgrjuK_6)$!mPf;ph7=NqCnf3J^qt?T7iP+>(CS5j#W zHP%#fE#0WKo3-_)s}9BndF)grX+Kx$%USHy_%#rOf*F zsj&4TLM4OAjT*$lylsG*pu#YICR_LUK{=BNYPFginY-6k-sE3RJOI zx=8|i*DB7tU$A}}s4;}6LB!&Et^Ty@-d|DBKuy$tFUpxW8dhrBu$8m2w#dh>o9lMR zo#ZFE?%jJY-H$woc-wmSc*s7&9v}3#_s`b9-xKGP=l#gjHP4oAuHE{}I^1gAHWs8k z#|vwB%sUUd%j7d9P3`88d6{|TrS+=d3c7-?+O9fpxHp0u)osUZ=NH z`=R)O^H}`YqvokHRiQ%aQY~K-=p{N)r|2C0I{V;w@$jC|T0Kw08T3F04}7S?DUiSm zYG@9Fa4sleFPh>oKA)=7OSv;w^IB#zm*rAY>~gHAC3#KWFWlLC3ri433SIQEhkZPT zYJF6*x>`&1`|5Vxp&Am&AeZ_ypmVg(m3*Y_HQ83BP8boTB^?Q6S$^@^SA#C

    <+} zV(L`CrggiU`%1^h`2fp|);K%cr#pjrYN|84Ty|#K+U8qieH+@h+QHlRMqAxW{>;z& z@lR_QLLbU)Pgz=$MKerrz=beNo5?4na#1gtkh;r0Q7L!0!ZKSeypCt=?&geW6r&u& zj;?&sP5cYzGXM==$1v?`F?bXn|Kl3vj(_tvZVFV@loTKs0xrEk)@_P!7juWMyCL*G zVNXhV#?5D>JSXsFO|N+I8U(+u=tCVJLd8c)J~sR@qiRUXuqrar-L*Pp#xQqE$b;wIHV% z>=$X9qpEkN1n1^>Lg!%iZlDb_d9xcJ3d6nP?7zIF_vM*e667lJ^ppb>^0 u|F(! zt3?p0Hj@Y;UOE>H7^bO&M+D$D#qJ(J+XpRIZ5Qrbs}*hH3hTX^tZ;0`oV7bI7!D$2 zxPD;d5_4ObaYdcIfmi4INfyum*gTGvLwXNsyI&=MMa#hy>}E=xoegdleXlobSuvpE z{HS>RlEzBq#+}`GOHoOKwdqtj_9Ph6ZH~oJsz#ws*XTK)S<6V;XOA;3zSBBXa!{}$ zo@*%_VkDTme?}vXZFNGJBcmgd)D^>@yfbfGG2p}pSeS-t5{3+mYPB4hl>x?F9im5m z9BnK~zP`yJA8fJj)!|0UC?@#I@Xk>ss!EJqc3NbSlU8bIE}>6IXWycy`rn$GXqU8F zuiFV9yw&5jS)Sf|brKDXZBI%WbOyEN@u1E1S^$hm|~wO<2N=_RX#(m{nA3GlTfU3KW6aq;MXJ*otAa z;~&J%oohZuI&3MavWhw0$>3O}7r=v`PvEi_gJ5a+JoFoTb9*^Zsd-@aX`FJHuwF}b z`Ds)OTUM3NEk}G7QKmjFVLuq)&)Pn|&$7&U*`vzfu5{C209N6}D{1iDZ{vYs%ZAT>KH82G$Ivl(%pD6;6D5Pi zxcEE%e?x(25CeiCE+l}Gp>#+ZZO-T2IO5E`c=9lhiDkj8mr(KnH*j!|2mFg4iY&HK z1z{S@fOox$Ml+S%-EAR4427`LRKulc#D(1PCN{@~XKo1X`@O+S}L>g79S*>c5P2KX!cT+m0 zc`a(0Ax8bBt=CqF3eh1bFND&rs=MkD;~ejk-F(s8?dtm-oVGt-x>W9Oywf-SDdpd> zqYA&AFRq(_U;qD&E#mF}v42;M5iyH@(-#Tk4;=RYplz*VO$)*QehTpaNg+Dxe+LRT z@n3%l(`t_WoHM`vQ`4{Cn%1AD`@f$Gkdx(;-6lUcbK` z@l7qb(B`+e)Oc1ZYUu+4!czq>;d*`HXgxYOt{gmo@g7q68!HEQWY#T5ZnyVw^|;Q_ z+fKU1I-*{Y#^*zkVo;`F<-gAE#*t=azWNb`un zn52l1By9mbs?=%Q@b2yZ%3H|QLA&j7%Km2Mq*xK6M7zd2EqpWOonImnQbbfPK>SD; zX-bqNSqxe&tX_kDt+VNKnO%0<;}&^th3z}k3<*D&{bLP)*^>tH{>#dIP)<4+qyU{P z!7zoWWg#p{1!gH@S#T^(C98sGWooew6RXhK6b9=;z#=Sc!(&^7*hh?Gq&P%^Q)IYA z$&RSuMa9l&IT0_%-@BIGzjlW^9 zO$yd&DZKgNqN-!H{bMfPJ$st8YSXSkqmErn>mW5nw|=mIPE6xA?c~&qE}}y!z75l+ z)kwWMwOV|3U1rUhx4_H-gCkfODshpyf5HX?UKF1QP0|X~szqU;_uu`)UKNyJLvr%! 
zkbAA5dn|y~5`;hBs(l(oG)v$;T)~aI+W|B?~l@4O+df5QsPHq zyeUi+1&O36(G(|U@~d3Q%2vM8$tLfo)r{Y(9qQ;VU3|&4zMJynu49R{tZfam%r)mW zFLGgvTKJ+Dv6zJ{Xu(t@5d=&DL)iE+uO%F~<#O2%adDPK#59#g>9iIB-m|8+#K>@X zM$2@C%xp#YSFBmZ8MVDNyVXX$WX)-<(dHPQ+d7NYnHS^QOb6KefVq%NDaErnY7 zlxdc(J87Q`v&z^^r;x=6G9=p9Q`Fpfk})~7LjqwKz$NR;+aV^O5sqMF!B^kYvN^W`bF;DS{jn$Mn z5JP#k#h8vv4v)*xoer+HM2EM|_i_i<28B^#WV6MX3Uu8-AD5w`RVzz<0*uD5G(&Cp(w0mnQ}W_?YQXxQ%}0&fq37D)} zQwsPns@DLClnSIBaKH#P#sYM;J&$Rq;N56I-hh40=F@ui;|OL~+7f@R4YlB_K|vBO8sm)~8@kGcw2*lC&z)xv!o4a{;EE)Le`vI7?Ik) z`V@>#4+$x5m&Y+;egF2Dn4?|y<^U6ToFrB(tIiy-{|N8u1fvN~z1P7QxtCYXn8JwH zcJsfXcl(`sd>rBw>5RzH;-#qUch+cFoSkt&OQYrSPxt@<5gZdHK>bd|E>t|rtszct z@aztfk$kcVhm6O8(=lZ!OX|}L#%+ZfHZ%l>45pvBv_5Be-Vu+nW}yNW zqF>7f9^CDd6X*(*;Yl8c#xWfJVlYbM(fs*5bW6{>8Ob8sO_^7d^)~P2IS~g{d(@ll z#+xj&F}7ZNl$47ZKFJx_@!MIY`AY2YL%5VbqivznYmWqu$bFbcuro%9+k4sR*&;W5 z7fFNY^kI{*6D;BP(}BJiF7B9j@!@$>EG6gICkNc!?e7_#pAK3;FsD&_HBvP0LzE#K zqmw8@J!Lz331XSw0DgEb_Y#(j)s!t4D?Asc2VMpCojc-{eyekWENPNc*`FoGzOcRZ2K)4ZUhE zGW-{yzJ5;$HO+R|f^^13;yC-0bg1AO4(hpew+NA;vlmuG)Ne)ONZ(_F8h-Lcc%*I5 zW+JvMNobOf2oHB2ow*D#d8bzZBT5Ka4~uHs`Odk=ll-U`7s%K`3#mvpaf${IYKp-? 
zQ43^}c6?X;fZ*>f3d0KexLEX^>D^=(&bb&RAFAvRJiEvnU32J7xlxfp8thc*D?P<` zy7)LTgxT2T(S7tK$e^OoEmQP)TjUQm-tzv4fp7#~$CSQk+je2x$b%K0e%{1|yD?YJxZBZ^h}YGOi;lOHBP zHy;kz_;7)K#^W>gjwY_!2pS?zqzyhUg@irjVQBfKqeNc#F0wcbix=O9Kjp}0^LU+K zVx5Ny$|3*)&lfA*a~8vzqaZE}V_zCu6dmq)6CZSIjt1(hSS;oUl0e->wm?lt&?09^ zd^+Mykc@PTi#`w~faYw}RmhVqOcd;BHBWX7CB2rChW`eU3N{Tav0$L$1W%Pe4;C+| z+RnI0=S_9Rhst(M%!e=5Gb$xKPiV}`?quSv)0~1KtJsUKMO8qWkt8X7)yG|ReEGxETy`X8#}ma8rr@k#nVj{A9Qht(@B$fd(jYrv zN#u(X+v4N5!jaPQ#CI2qqbzrve?AguVsEV8Rq!Qp382X23F_GC5?$&AHTNKOl6KxQR^_4=)w!G@5IVr1K}kF_0z64VyJ>GJ|z)f^q22o4qv(DuEr2 z0gD`Rq6-ES0Rpt48EU6i*4nDkf3j;N%ee*@DdikbuIbqnn1FXSZB4z7t2CV%U&CPr z{Wj@IlWB;~zM#Fg>tx=+@oE;vX>z#aoE$6}l^M$E+!{rd`fe7e-V6S$)KZKn2wA0z zRBG2CyN1*yhy1-X3a>{B1%-ce3D+nj$xzuo_CiXSmY>Jz&&WmuS)&Z2`C4kF1s~zK z*1viCz*g{T82^Y7YxHPbkJmgf6=V7B5|?kCVDFj(izTI54@-c9j;v3P25NhFgU_T| zqQaI113{r>5`oS52oV@GsPu@8^(||0$41PKno~Q-1EN4X4e=TOZ_FxKPq^Oh_Dz(y zPf4}>gZ7N(2S*oyPpLM&mTRtVFC!&J&KMwJ@o|!n*~je|`@xF(PPS7?T?+~P7<7G0 zq}~cn{~?a4QiYTUtIqEryctkZ^A~MF8YPb^+VG%{rl7eHYYtaPS3`oH(zaUDH@M+e zfH)VWL@723y}<_9d1yK7YhTGh0Q)xAgQagOlQP|W&1I!OU7@%ZlbMP+QL8)NGge6x zOPw;}Vfmndf8-MT-IBCnncE84z*E15l?~`UK40U5%0e5B3FW@I{b<_XbYJxG330cA zsfUhVLmF*+ywg8m`Z}REQxmmv5W1;A1uArF@~DUCUwIWH$`e-g{U>WKcTDX_i*#*E zdsDp95!<|B?Go^>z&1-t_xyCq(UxgYNG36F0*ZUck^sS4Zpsxk~^oL zyl!^MCUofmXBvNJil#CFgLSdgTB+n@j|ye72JsQ7BGSwwZ~DV=+kW+KJQNtibDn6x z_?H8aQgPlx`mglF_NFgImhOy!{P;5EpS=a>Q-G9EXWylKu!&}J{gB1S^v^}$%awd+ z&5~{-p(&`KA7;m^N^sVXp20TBRUJ68yWRR*4FlO*T<~ltY|3+<6keN!K(@Jkgdfc5 zVYcRDPSeKamKTa0`N1QEZ2CtaC_qo7l4^JNq8bM%SDZr)FX@B9%BcNeWKR23)ixP< zUcU5W{Ve-{^V{^?;C-g2_4k-D@fg{4>@H^zhkpR<-A{~bqn(cwhT0*wq7_$}S&Ul@vY*pR?qa`YgtZ5}bn7~T!!JNud`$^CP3 zj&Y;CcWl8>q^nZS_zQ<+NOeT@CU_TQNZwpyET0>3bL-c_i2mer0AL4#~tW*We@I-ypsX`1$M@0qYY`h!5tG?RLVU(zHK)b+?k0 z>QB6O1hS8*UWPI9Wrfam{~j|GZl&El@^XH5+l+X=7V|7kI)(gTTJNOOK_U01Zox9Z zzUpoFSdg86KYygzSCa^qyX@U&#>07mHS%Zg0Fv%P>{-f;y_;g%u^|MXuLCo${JMvJ z`lnuw;ng;2YRbl?{FIL!r9&aP2L*eVzAp<7Rtvlmp7GnL#UXUgHTrq(?C`~}$XazV 
zR*XO00i++#a*hBQWv7eEn4RF|b?(V{Oo7vAIIUqUT!-2LTcPiip0t%4H+D7h$Ni{?dwctpN@O%OyP;&nV+D&Gf)oP!QK0~mg@Xk?$&K4Wbv5uRbFw& zF8k#xPo+PtWVIRFyeJ{@4Ca+djLCQ74s`3{x|lO!0_sV)C$pp3NnQ_uyZ{=3?*zJw z;=4EH=|+GqvpWWi)y?^ydzgT>#VJ`!%#p25p(yh`%0!A752<3P(^FKxY;;X**3WoS zpkt@RAMheXDk|qvFIf~p;;pu8RkmK=QQ}Q?(+^;i0rB9&_JNQ6cpo=o2?gd*7KyUp zJkyGYX@wfatkL&d4vna7bsYzr(0a+&Blnb&2IfpVs48E3;4wKi1~Vu3H#0BzfnP3u zKg-rV%Wr4{LQRs+lbo&P1w)_Zyp=l7##^K2=_t;53VV8HS5MQ-6_WA_W9Us)jZ+PN zyh<65A8F@q`|RlF@veXF?ND(&v%+r?>pYQ8vEK4uJJHC6G z3fsDhIVz>{x!7sfs?xs%HVKTpmNr$h`i|xvcwk{Y+wMvGMrB@P5BMf+A3f1KSz2}B z`hEo~#-L)T+j4GgMDYUK8k#A^Tn@8uCoG#Jwt%mUndLiYbY3r-*@@u3OsbV+;YZjB z)>6b7rI%s3gDc-sd36rmALg_>sA^C{(q~;883wD1CXr`3(QM~PZdLMN=K`jzH#*9= zHZm@MfJG6$NF7URYZ3)Zc81%+PhyVlc>%X-*Q$LE+Dl+mdfU`@G^KGtFQn3V;edBO zUx>k3eSJuSMr%-QDP?oszFXz#Q47O$&uGq-^8NSE)E5*-4w_D)fH*R-5+t8FyZ;v_& zVoUS1cz1l61YT%OmD5sm|?-%s_@aP+_6~HI3hKyg$S(t#-c{q0)*yz=crUy!A z8yWoS3{N(Swn>`4`go$#c<&YnjuC7_wuq^CBfXGGo*OD@U|z5nz_2tGD!1j+b-V19 znYO{K_n4Jn_Ml|S)56%5eMFrA!S&LVJexiH9saeocNT27zO~U>u<<5~3kvRC*eP&m z?Ip@zMX?1x^%aAHfeZf>Mv%Nf9ASuMS*RUkC`$+0I+PoI4B)RC>Ojce9y+_WV~EYCmPoJU)} z4Q&nGSZW7g_UuBdu+=M&sFE^ypRd}hSeaBRDJI-nMqpuq28nd>w9&DE?RdA3u^Ejl zV;dzMG`EjE;^W|Beu3FFZI~W*4o`tp;?dfljolFdjxhE4Z5&Bpe|PvSBScR z$%_G7j9#_Q)ZlrusX9zoG%Fov*sW+x6>`ZtTK{sKZ2rMe%AVea3e`WAoJ@LGj@o5H zZ>tmaG^<>@mU?D~e6eP7L1$OX(b&i1NK5sX6@HITjo9>HX>NSG&o8iyeX?YH0)@U+I2AXX zb|;z>)%@FC_N0k7gu1xHEsz~fLY!e2${k9WKxp>fVSRU8CyjP-?NbFUQwXBfs*#2@ z1(uDYa}HazM&rbeIkdMguse~k`#v8Zo*TpKB$k!`;uG7V7QwX^YR&v`OH_YU&GvTY zZ7wRfkT@T*#4=c`op#P}0`_Jc7Tr#Nl_zWGq zwkqDl^}MXMQp*)K4Ri|FbDb;Yh7OF;k7;I_5rUYbFICsoeC1h$7B~tT*%TvRCJMB2 z%y5@P9bcgGV*(V7nZVcB6x^EIc0~y5D{PU`>|BL1(8U6wyG?)U3gJT;ciy>ww5?ey zk1iEhH=-LMk5Qp+(AyI>@bj8w;p6hV;MVcys0jLL5BPGO{Yupe$D1B5x;A*B&IW$v ziau{4xZpaH98S851z$Er)j0chd-T2*$*qnELK6n$-i*yQd7Q3%uCn=hy60&J6sPGMxcA@VuVY&84%8D0 zB-(r8A7kIlEreK07T3+P$UB-mc3hpA50Oa*8gaVHzqy8B0$&K&<0hksp|;jco-F}p zsZJ;S_5oi;uBAKZjdyi+mF`-ha7O7Q^ATMgNz1O7E_{wrNgACqSaoP6t5H-k(55r% 
zTUC+)f9Y^Kin#SW4QtL%FEA_%{HRmKSgl5>9EN}{?rEFl1{{v-ONRuR>Pm(=h=q6Y9`PSEHzOphE}A!xX7w)Qj0*775IbmQcvi zViuUCTfLZ3sVB??nVI@#Eq}o6<_|P$^?WbQ_dgo2Vtc^PBHEcMW{pp#sPU>8OoOu} z6m+(N9|Z^5?SqXyL$YIQT#*I&=J2BRR%8G_5&3yBosvs??TEA|B{vIw3?)?KCJnCN znzEabP}qj}&1Of??REqKE>nyjRJOlE{g+g|!RH977CD`VuEQFb63?pyY@+ zNRaCv^LI0&C|B1CA}7_xF@03h{Zoz1q8d96{Gq2v{(kh)R1O<)$OY&J*+5G;xX>EjT!?9ZAy zlx+CWbaZnbrBy6_`-=}*iAVlOo*4zF8HagMG`IvTVL8)?7mC1UKWwG|B zHQm}8n%Kr5`BiRIYZ97-gjl79B}e5(wp~hy;j~P-_fIS~K5(eQP+2dB`NV+8jmp_Y z8Z223TOm2f9*10&byYDmO083AY8g4>sEo@rrxr4DOM=bj4lQ=0($yuC-HhZ2N#%4f zu#caB)4HFPgk*AivyHW8K?;%t+3OAEX~X9hj(@1%oNQ=7Q$0gNt@EnYc(0cF#AYGS zA8|QZY>V2MERiHZHnE-5dv)wXp+<-caQL7!BB4MlhRI{&OQ#iqdE@B9bHizd^7_Vt z6(Jer*|cu>)*DO>74KE66G`_nn!XtT!9w##dKPIl3K0+ClC39bDzGHe9HkF`Z|B7f z-ZYpzL@J~%^P96>_2m>}OQ6x(_T$SGd zuC?Wb_ML6CkXYc%(a0T`Su+!8t<^K3+-h6CvzZ^jd^z`YT6A=7<6Lvy{1>vLJFe`! z0utkpyka6+Q~A<@(1SC3&YE9E-<;wDE(-VRp(7o`3xQyF=6uA7n8yGNUAUipgLPil zaCC*E>YV+#Ju}l9wXoJ$Am*B=)hcz;y%37E&hMK!mSL={v-`r`=3hh5qB2Wrd3w_WqfJ=23VvtnGM=C*`tdV%c_6QY^C=ZsA-j3B`=KfqQ_H* zNzT~gzvbclPd{u8gGIz>MYQc^v7*ha5eIq&8I=XhFU#y)*D|icPxfOB4z=IzQz#X% zT)!G!ZG@Owfy00XZK_s3#v=`M3q1B7fjlJP;PsaJQ`nPM8o|gB2^+eFV5_Iu7aN-O z91VA^0vpKyoz_1XRu6)|kP<_*gX}w@6b_wZoI)VIg1A4HHK$@)IDf1H%y?MEXWEV@r7 z(-O9eNOMTB!jE1@XF+Owj|AO*o47vIOv|3xJdmEk5j3sNH9*7x?^ zv)#B`$(uAohyejNGd6#UGM$t$PE}-Fgy3E0)N9V6=a}RFD3-%7b-lCytw?N88#Kr} z=%YTLo_O2J*1lKo+#9~mS-5voVXxrq40!Y)0#OfrpeLFYXaz?4hqZ;q2DAYLbKOH) zM}HO^z+?xyqk*eVt!Pj@l+fwjU4a~Dn4;c2n}#D}e*5v>Ui;*j?~>8(8TJQPT0Ajv z@4d-b#dV`g(8F*&3nb{%=iDl09WQfUf;<*8nA{V=)&1@HHJM5ZP1?xW6kosio_KAE09B>>!&38$t61__z>ra2#3kwQs^o6EMOCb@jD(_A6 zzqE@`Z!I)bstR#6yDkJay?mPRdPcpY{(dN4d(XRhI2Kw{`tYMyEWmwY&mJliVCM_#ZPr&U}wd zEay)`JNdWqwHK&|0D|p=w8OUnf-ULvoYwUJ3`jSiJ_x%r=gMp4~x316N_Gam*8ssK?Ml5J0;p7=UI(7} zdGitI_1WfXw~m>cS2u%;L(<0)Y0E+?ePmGkN4jlkqlXgvx&$iGiCF@V2l;*aH6Np66ovbvNw2{WT8Z|V`xR1hy+qM}ZwZ4D8eJwHjTGwnR;U`0McOXp&n6Jz?t4z4sSGVl0Dt_X} z`DR*A84g|f4XHx-;;(H?6_YP?c=^Fc1LXzK;_M$H=~Lj3{DDiOpJ>@3)~=gV%O$;F 
zcTBAY>iUDcz~RY83rM|d`~unyc&EsQ`s|M!yUkDCzGm;(J(>@uj)86bO7neA=xgW_ z^x_@CQtz6YBdeP5z(hA}mAR@~6;9d)XJiG(92@i7j&v#}p`@+@8_;aD+F@p{g}waM z2{mLcB{98lEX~3#K|y-e`FmgE@pC6vSMVVwrF|3^ef!w>VMq)QVc*O<|J1v{KEB%Z z)Fpq|D}Z^jI*fKr42{(iYOA12zZ9&TpMy*w0Ofkt_;r&#J-Xv-0v>OW2=!`{Gq%SS zitXMKq79)Mg`6ZYLtEm1Jp|9^?1p5duay+tERb0k{VP(f!bNAFtRwt>hiUQH790JT z%;ctlUUQNw2+0L_7}c7I5l%Eo-hu*J?uxghOOjJma#!?6QLrdlPwn- zL`H*w-|5FJvagJ`nUV~lTdHv)NDkJsD4v9Mn#giC*tEYDj(!>%^x66LdwxhmfiTz7 z!B+SoKW4xq(^|Z!g+nx%Nj$c_9us10+lDHuF!wA=3y8*cg|yRXRCadD0X-L4YLeU^ z_f_muEo;-*Bgs$!SR(peTF z)yQTOEk+7>s#E&W(tGgs{pg=?QM_lC|876fbFC|wng+cs*lfwme-YZG5sAEUj$WVO z2)!bW+4^?AH80R?(`y=*Gp#OrNTpRcA-~TBDYPLK9MTv?Vy<4Q(;B2)u~7svbA6qj ze03n>=|6b>@HTKrTcSyt^!ir0tP6sbOS@!7|568kXS2V2kFv8xt~9XC1d5T%CR&YD z4qHzlSlAq*NlOBU_Zrdma2Rbf0=3PLyIn6WW37H|f?=Q=SE?iKA|qW-E0eRnw+4yg zanXp)D%zl`BO`@CrRNaT$6j1PNQ0H(@3hW9U=Tf z!hPd;_V{6$NGHQJ%YF8sN=%qeU%+M;ll$rj2Uwr-euvb0Sf$6U5Vkha(j_x?PFq2kz@q9?47ZVOvR_-AXH&#~ON0_O+U0!sv zvhrq8`R(ex?%iLt|4x1|(^0ADVt`mifLYN_ zLl9cKLP2Xo?2)dBO}ia3_;x$;i*@Gj)iyz&9M`OqiDZW#zv7<3k^Uj|YMsYrnUnVW7 z<7j{JW$RQDt(?OWJU8dj$7OF`Z0ktf+KSOU0!XrKQE6R##=^{~A0X zl_>va))M|I=hg@j1vsY(Ji|S8S8+!m5d;>u5z^|5;#ldu)H zLJuC37$6BBzgNCuq{Nmh?gGa+PM9SIG4VdUkx1BcZwKqPJXj zzQDf=aH?n|<#sHA;}d&p!E2hko_}-sc`%@LYYH$9Fb{AgL`*sboUhkrG-2Z8>z-M1 zdOWpjy{|i(nj_-~Mi*9)o=ZukJl(`n@pY*}FRV=!uEyj^ToN4dhF zjPhvj^vX;8_(i|`f+Lj$A?e_jGdxe2Gnd`Gjop%wpMM24IdB@vw6V1$(gtW+!HE+2T!Zu6dBt3)bHVLP7fgvDeKk`*Qy6_!eTJ>R z7LZcd-VhROa42;+4FcL1;IQT8SlrH}z%>xN;@+>8^~kCWn4Qc1K-*|wcA>oOzk6L< zzoa4Ee+|H-{!?o9?%aRj|G<#U+u$o7tB%+4+Xia6_M&R0gZA%(7meCFECG*}bo5>~ z%2cveFduZaalxt7=-Z!uHu{z&;M&#p6;~DQz@M@x24;KzM6=ApefBAk_t*mwtBbn& z^BPyp=c_ec)+&@;1jF+*_56@U?ffY1mdf4|;wcJFgMp!^7VK&2dLF5A-%9&Py;bAm z5bX0{5#o<0VQOGW{S*ottZz?%oYZs8fDiz2$KPM#|NKn!$%U^!y9gpXM4k3^q=_bn z#RKYXyAJ$W8gRaWgO)3>{n(9Hu73m0Z@l)+btjFb-LLUYhwjAKlYJ!Q$uZ<}5V!?Z zo?=n|B2SO!`>3CPao#u}nU0N?F+?!D*UUGr|k z6V-2$PSA3lnGw zz@edzr6*id21G6t!E)0Sx^Kubbohs1*eT)*oDz5#e686f5byf01Q63{j>;jk~y3p3H`JQXgPE`HUSmy 
z6~10r@S4#IdfT6-9;ZVmuhBu^p@)P)mejRe|1_fI^Mm`eVY$pT`Ad4>pN98ur6edD z(Letm7?{4&1AW?pRDq)RA=GoEJE44d#i}EkX!pb3;NcX)T#+OVLd;Hl@w zIA7qb_4t-*3jNJ0D}%?Z>}S1nPP3g&O7oZJKJhiT)~qPEbG5~3kAI44>zN#lRMhSLKw4V2ypB9WEZDOJ6maxOmN=kLhyyJw z-8Zv{uWl76S88Jn5u)*62K6EpgWXhArJ}>R+8igzOQhX<@7EvTnOt3vAeRM5LQC9@ zSz~02tA2i=ph{!Pw_*KM3!S`Y`>p{7m9GgAs)Wood`$_2C6>SXR8i4ogB*D9R&5fi zt&ls@W@TCJ(ge*8(^_OQl0=FwmP(T}n3hN)mck+p^0|!)1viZOlyYDqTQ9SvySLZZ zt^2>e{@(h_6*oVU+R|Z;8L0>Ir^HbT1l32y;vj*G*6-9*PlsrtY*s}if~Zs~Xo`qc zszx#7=!ZzrMhQ(riAhvUYE}6StDjQ0jIsPExAM=zg1eR6@UgP}87iuT8k(g@42FgM z%A%D@LGMWC!m{u*QB*Es5F?Vv!Vm;5rBRLb6a$@3F&fF#sFO@F>V{bhW_%8TSXEnF zMI_(==}Y?8Zj-M;Fcm^VUCn3=iP`@&hXMc2B0YC5SYxd!n9hxr;-KJXhzdRWwVp@+ z^5h3y?t21?FAeR^>A|%iV5f<6NlL&nYw#k|jsbYgZjJaNIs^h6Ga!&O}u!1SiQ<#uYVUA4UOR>OXxtcbC zHCn&X;UI6a#o7cKWn8Rp&QbX>xhgjz(fBR&VW)#W?1wZ8Z!lXQ#ANDp9j)@`|AwwE zHjnIsCKsE5U;{Mv>0|jEtx78~$oj$yi+lr#Mq`2{^vUy&tdc?H+qKO!p;J`Q6_7zx zXh-{jJi0+95qX4)=z?O$mPXJy`%d4bRm)baU)?wGjc5QttWE@hkb%kdh7Ln}pP|D5 zNGqXrQ1;3%I-T&w`h4_dfw59Xqlt|bf|uw@qSN>G_d@m3=yJb=3qWsbxsA)D=TO|_sB3zjz!s6OH zV;L{$^k~GPn}p&|PCW%3_tQG=?DqoiHh329-i2McwVz3N?wEcCj}^|r-69~}gHJo7 zot=rq5J-C`KsNU_XXnaRvNZoe`1Lcrt;^qeL*pK-JCJFpEN*bQuz8y$s*ZJQq$1hRApLlkbTV zn9QMp=u3ZK2pWU)-@8QmK-97I%7LF}Z%>#DtZb z7tV46`Zy!Q%qqXiQn?mrEE+A%C!^wfirST-F=;>n{AZu`k3Lzp?d&HkFfg986c)f8 zIdIPT93YQAn^O?zC#e7FoZsJGgMvrKGk1lM%%h>$x^IyS-!AwSgo+_>@INnegE6u~ z>B-Hvv|vXOdvDE7&(~F&kc8c4GV@v^5*1Zn6+x+)A<0leqt?^Zc!JwX&;E?6F?kWY z!UryRT6h9(kH@dox&0ouJD`=N~7%@YONt?g9I&gaSUlD})pXzUU*{5}jAUr)jK*H}OJ)*qC=7rmSFql+C*lpnxFYg z^nIlZgg&u)mFLI~cSfJ}t4T@W|1D+wlbo(~W@;$8hdKXM zpUs`=-YWj-__yoESRXmUf&vbx22 zuU`8q(|{LcjD6YgbCu&UX5&Wn$)>r`;~8+hP~Gs}6KW%QXn3_p1Z59g*mnWg!KLmP zlwROQF1s=9LbFRcY@D*y=6zhvI&=MEll|7dl-rm4@jT^Iw1(cuXMHvxw`zkXJgF96 zj5h@>T0m`%`l6nYRILId?ysU)RJU4?zm`2)`9GTce{A_Tgj;NYVoO#5>PLibc=%$2 zQvta}UoTNVUW-4>meiZ|V3U%hBcVGqVf(P3)QYmd3GA}TxX)s9vBlZ1u9K`wFJe7QtStxBfK%2Y5j55X0Q<{`N%67r;&p1s zx~@c^qH_UtJm3oe>K&HeeDIM8CO&2zoWPH~S6cQ=e*S6JxuB{t?F&-HrDCx89cq4U 
zrB!-um|1@=E9<}SK-$1h`jK96J{CEF01`>_ZQp7z*iSdLrwQ!+h}rH@x8t{TR$f5r zM9j%fA%>p~Yx@Lw!%K30qGavr;q|;yK<%!rx&iL|1^@Z}zvpuDba}i&A+h-1e}9Ao zu)o*x3d$qX+%DDPUm0$(!R9SGl)Z88VAwLK=P$&q?iP2do#7=L^0MyaE!o}O0*)3n zuV!xYkbf0gg?zSe;K6ioTJ93w%M}v{C0XctC`=g`;K400AQf4;hpk}n&u`H-nJ+py ze04#vsqkcR(R5+qbW!oi!&C!@KoB`la=NtmOi9U^;?mQ=@6agPini`N8cdP&KjVP& zUggiO|J`h^B@YgNRm=5Vxqn14Ew=7o zC?;7Nc7m2N@MFsTE84l{8|hrgai)SvQmB~p`oW#d({=7r_DQ1yPpm+I##LlEtG!U^ z#D3t3j2HkJ7X4?m?R5(diL*Am^Tb)`PAo2cu;X4T7gr9#9tb0jjs}-zoP|a38mYuV zH*pX_9C##7usDIecFB$d>3ZzMxxzmzz)pK@(bVn73cXNKN)EDtzil3ZhKUD9 z$GGUEk#zS!E!K-um$@fA>3UOd>c2y4`yG;qwY1RDh2sDe3!&LwdLU^Ucd z9c=D?`!OkoLfdmTAs+^Di?|_27+RBwFi1Qy=G#s^O~Le7O`)Pfki4f76PL)X>el5*I{&^N&Ng70+i=}rAh_zwG6+q+`G z(riuSVbGIHN=B_fjc-KbC&BB!?V>!OtXqxQp2kAxMWPIQY}C@yX7@vpWZwDsF>=>p z_z=8MRohW*8F5P?$|5Z;v*;DbatO5ca0qb7w~Z`40uD9IZSgU zFRQT|g-cBv0T6`{qCX^U_0wrM2QvDR+?;W^*TwP4t3U3*tEcyZ!M+-CaPU%hX&X8=c!#S_+~vj*P-2_crFRaoklWlU z*R0zucaIdh90*ii-2qZa^8<#GZySjY#X2jwxReb74l`VM+mg93!aXf;VV2uhp=y_Q zo=3l-{Q`7SOKsOmu>)`eu{|EelvH=_$IN-6Q%a5or1bO_5)J+ok1ki~kp>rrw>AhY z^S(Lh6`9R8*2E6cZfWV!Uevhl!djtBDY#Ijl&e+6jk~dUMzuH5;2n?dQ0b8d7l!vU z2rToyInl}mQ0`w9mxoX>I;vuI95?R30IZ{8Bvr-bR&?`)F;AY7RKd+&F)hF7ej{?( znpLm(A}5Xf0@j4x!i5yYwCT@xqW2hm^E$E;_76Ix=pfgCHfn=vFU~G~Z+NSDb=cZv zk276T|1x_j^B9>rE;hBCQ-HntdFP;rs+EO6NI;H&$a92+*tFtNG;61D#hbUfqDPINNZlWl-w`clG_Mo}T3md8-xSkhG`XMbDEH zi>`9IUyFD8JaTg-muvi;vXIR8o@zQdt?h+J-CbGU-|?69b0lwnZ&K@?9F)#SGP4g4 za*56_*ZBoEzqEsXjol;T_1s~$wcDY?Z82r3(Evw4xWCZD?t&*f{qp=8Y8cwh^wzWJWee(+qSs3>pM*o)NIBp&d?MhV`Z@7J{$Ye4;nqCQ zja%d&$p67F`5)edAG!Z51+q-8pJ$b7P_k-_ik@$9?E#{5z3kz8BH~f?RzvqGF_r$w zHG4-rA43#OMD=-zJ;axhUbAa%;SZB&_uj}|id!C29K}-t5+AaIQYe+uC|zcNGU*Ck zrCO>JX5YkWy*>M}J^im>;4P>A(U|p@tF8D2dy4D5-T%Br!_VRex$C?FY~H#T<~O^$^~6b8U3rW6ksI`s@z5dmZ(^ zHyt@kH$ZE}+qIKC)pp2~-W(ww;TDaGO4c7VzqBW!a-VOp4~9;nA?f=_k?^wx$`o)4 zKH)v+0z4T*9Sjtqs3&@erOWtM`vmm9CncUJ_l#$C_?&aqdA1FUUacL_==!|A{kSn*Is6PuYh7(j83=Wlui27z zQnw->ssR%%Mu+JmYvS$)Z9meYXh(ChS=7ds3>eZWUWQeBGnvSB%CfhG?5^$Q_@;)a 
zRSXecfgQ9}EN5lqk?7eZoGC3Q!k)9XwMLqCP$@|tJUJP zYv0^@kio~$**rWFnuz3UHSsugLU$R0}x~J=Hc*w~H9upULmzzN^&iOJds%hyQaCdoz9U`@m_MEH&bc3mM}F@ zQHb>;%!+bKCE`_6zYGa& zUlai5%#tiEuwZppB--WJsJRELum=gM8iZ$HZP%2UR)KuS4o*^=xXnGdYu&g@svp%r zXx*94EI6|o(MYr-1|t=9SQ!Ha=EB-rHy;D>ZjTJ-ung2$NoAYyxR)OQF>SrFXxQXD zhZm?gQ_St>@sRjzK|6WPWVxPjQs-|V`w@<5UWL+Zzw-pD3$q@11>kdTqPt}M)hyNxBx~{- zQNpS&sm>wGwlJjV?ou;_?L5e)5$lPuQ^Qm0IKu{P*?97p$#4&}JB?bfeu6TsQQL>J zfgfx78ud-l)d+~!d}&LyiiuL|^39^h^WpmD&P(*j0z_>GcUQ-9dzy3H%1k|zz}y*R zX+y-NFn7$dt~YQiqtjRmX9ms@spZ^XqZ@k@cC_;u+FC2mxP+lFqY6aJt~3g$l-5|1 zgj1&)1?rz2VZFko!I;Sp*ThGPF zxI{-`?idrR4BVx`*;orlnsY=}4(9t_RU+troBsM|iJjzpw|1GP|#2R+ooNz&` z9A$2}XVhE2LnA4P^400I(Jm*Q>#R(9i?^)xmDJqrhKeromi0F`FTXH2`|5XF-s=6? zaypsUzMzs)!@2U5v(NTYcA0{H167qc;+djg8!x8l{<^ujZBP$WQXCoXt?7 zBerI&YBp&Hup&M=J|!#RB8i!n8R80!;vW}=A-tfe~04N`2BvpK|<=&d)|hMo@S;!!Hw7WltWzLuktOn*LsEe-5bj$UAE*}(gIoT z_3)|lr!3-3O_T0xQtLX=GcXbrhKL00EBQ7r{k-lguY)kdJT! zCy)375`$jC=q8#Sog6L6v7Aap-cD26;>g2%5i)v`3cv=0!98GB9j#ZYylU%mZ8T@w z*#(|)nJ%;1aIxW=8-B3i=No>be4%`={HFY4@r}jL7Vj?Z@1Sj#wb&;6pT`TG7n;7k z=|`J>x#@SRm#c~6k9OyK-`xDiGvCh~%)Oi2%I)k!JxVZyKsX|tZTW_l?{E3JmIo(h z2Pay8ryn^F7>R)@Ri&PEyzQ^*;K}2%4+R}4EWjSPAqfo_!x2Cbpa=uNpohj2F5&@t zv4>lHk2IPA2w#IcBtt!n!i#2UAfb8MBR3_fPGdR}NIbdJr5Xw3(_Y+hjusc=e*B7G z)7#QxYNlyAr+;ZY4MGSN;zB9V`+1UI*uhF=y?~Aen(4rak0B-r@=KA8h9P)2oC)Vu zQFW_o>VtmTulsF(lFZ)6KH>>qw@STcKho!rlLjS>J#L=(jo18^NjA92Q*N=w9SJds z(qv6eBq(`l%R>G{l0f!!>#VLzdY~RnYeOe`Qb+|YYFDeusi$LWo6Ky<4$Ws7YuUo? 
zMwn)G8(Y&9i=1A3;8Wl93y*r)2fp(U&bZ|rUpnKK_jBbgkMmsK%buLhuaYWKeim=3 zmQi^Yt$ZwA8P~2a^|KahuUZY7?Mpjp{+4dtwrnpO(nhV>Z}z2r+b`YfY4<7;Mm2-M zXl2YW=QDGeA?8WubIc3OcUT3i1nU^ znxs&h<{OsP^3S{Ykze`35OZvClgHd(fgSD%kt~at%*(c1NLYTPB@@9#6j%DJ2ce-Q zPzAIM+5qi>9)g~PUW6V%W6*o(uO-q_W+7TQ7KH_}cq~y%r)9vh$?~~X!K$!swtj1) z*{rr>w#%>~ycOOF?}ML(-+|x5|LjzIy^~yQ5D4i(jv}X!>&Tn~<0x>5 z9grjFXm>p9_}aPJdBpj+^PKZ_=ZN#2^Ph|4s&cisHo1`rCpiiQ&q66q# zblzg!1h?7U!iM`BJ_%cdpN-^4)N*BP1cJdg^dUPyg0AI{+XW4L{#E_phe zerws*<2Cz5fN9Ha*r?aVZd*|mzxb@D~U2EMMspD#_7S+BAs;6>Q ziX#YcEMNytL{UT=bDRN!g^316U|^s(WMm;xX%jS+aW z$%$izO&k}CBjYweEQsM-A!plhaWs(=SDg2B0SS?$H1bxkuOUD2l*Qs~qI2=Ucu%X3 z#@fpK5(1M1j$8*Po(!&p7GH`#2ZgZmk`-M3Zr^p+weh@q6Z6Z4mrSdxyRQV@t0SKP=Uu=1zkQ++8n}k2 z6`1WSTvbTCfs;a67HUms(Z)y(>?6j)PctY-791qcFU#4{S4Ts(cq)0&Etk&DKr z@5|(Ae^6=7 znJN%+{wFGC?`z*x4mhh?gK!QHFj76}mkjRK-evsx9VLUlrp+0I%D8!L?4O_S^HQ8`p zE#MKQOy37NOnQC>MW=DbXpTyWqSSg5(Q?K*i0Ni%nXQWIgn{+Mx6gPHN7{6l!Z=Gx z2_Z6Apl%z;#_-Q=MbHUjGCuzAYZz7<_Wqs;=yqw`T25Wc-&{aWgm zU#R7OLaYEepzd6$d;9+!-lq&g(80Clu0N6*x49pUp5e8qkghm$?XWx#Z#o`nJnxK1jypD0IPu6 zMYO$tNP4R^nEUNTE5K+NKXNnoGFZ2^b2Jzpq$*%e^;DZ6ylQB_uA}g8%cufZbf>_3UnWnP zI1r0#6WxfX&wG3p35ssZK*1ds6WMVt(Ly{tK}UT;bDWL)kjps%v)o~Tv5Qqb0)5am zw&sFHEU%sUxU4zI2yyzjG#@6i{8_=I%!f89ir8pcFM(vq@tU9Khl+=avDrxIW?=fh zoIYhdix`+WGcXbtpsrlS<*SovCNMPkLLdqiOpg;N)b(#3DF1t%930y~n^64S`QsaF zga5uHZ;le@kkA22P6-AqW9L|9txknYr8TR$Qju)~`GaeKe~^CN>Dv9{tgGxA1o>8e zSnx|k6>agx0|;B2Df)JM3$P6t*-SsMhE#A63i{ly*t)3R+GNZf3;T$2+Re*q7cwLDb-AMEyj5`Q?9lcxSY~ASD;5~PjQB$Q7>Z!Uc0IkTox`EmvmV?g{rAwSrBcl}qnXn7epTO)nGzgA4Euw+EDNk2^jPN!3- z&ap9MVp>flIuq_XoJ7pV&`{k2GB9AACa8f9j691<83tx@AS1F?IAWXoi7E-sirfjo zX0k9#kSV~aXcDF(v6D?tr%t~<$l#=?iL$~;PQ$tlJB`ZithBY;a(_6XNhk}#RG&(1 zX+gy<~HYem&_E@YfEvTla=XUERv{`HP^-Pd=4wT2DlMEzwL>btod;1EP{sjF> zr&ooMT981nwO;ub>~K1T5EOI8z?g|xHKVZYM;G~s8#+>ZMQx6$rmUfZwZcdx6wr*d zX!nc*jalbQ{Ld>&x*a%2?&6roD{zo|yxF)C%7Tk?0jf7q^SxDavw6?3-+91FV_0hy zvfG9A{cZ(4VKVCx{%bvnv;5?CnVsdzuLr6#_LYyziHr2B3J-#j8X_KvPLUz#g0795 
ze4Z8dLEK6t*-S|lV>AN(%M7!q{(MgEZyDAHgCn_!bdm!tqmZ&8`k|mVZ;AQTi~F$B zN&{Omm{}+zo3@f)GURM&!l8}Y(kheUJU#S`wO>!3U?TKc+rp*H=AIc)6)Ofr<;RKA zXcna)&hYpT(iA?k@co_+FlU!LSa6~T>!1!TmBzNI5}V#FGqS`j^88_jiZ|hOKfF8n z6Novk%}H%7!BLv(NpN7&?kW2jT5_&8`Xd646SGUl2+aH%P8i*v5dU;u3+trYAu*l? zD5s<(O@mpKj#IceIAg;{_y^cHaTy!9FFQ9Uq83{~0;JTyqP+kvfC~cpJHoX5-R-I# z6`OVljY5^p1JH!MRH2`BO#-IpMATcf;n@L}3jyiECTr?*WzB^O0S|g2N4mZkb~)3@ zFZi2~a@Ndp-wV2#w8w|qNK>f8n0RI7Pec>$e4&cu0lr)to^y)Vym7B~G%g~54s?$> z*rWlh*K~2mUt2sO_JYHrr+uuBi*8i@m*%QOXLs&%+TZgBg3)bD;Yl6{bjxAkzVPoI z&nni71G~XOV;K&xejHnO|2Uy;EoFv{+{VlioF$H3+vcCVAa!UqHy(%||LQMfPrlzo zLjlcEhP+|i^0pT^elN;;glT`u$luRJLPy^m`9*W}M@I*gsEH!{;<~t~?Tas8p{Uu# zV>U4q(v1 zH-`nJ8{HD1MiSjcpv@mV#}a3lANR|MN@IC`iKuR=3~+LVrd#b0;|E<_B}P0XSYU=d zC~c1P4AO@R_BdexxvVkBj)=O>vHW>)f?2#3)x^sY=7a19ONKQMjY0CV#@4dti%Kzn zwE}I$uQJmE5~j<`VBnknlzNR(Tr+r^jn!3#!t}tP(z^&nB8nDttsvP%2_pkIk1Se7 zngzn3Nropg_}{Z`&}Ge?L~3iOGJEM}D&4sIMAaRf_i0d^NP!Tu1_2pC4vM?jMfk1G z%Js_i-&-CmTfu(P1pRRa-U!^6*PjwDg7rhw2K?2?{UA8?dHvAR!ywtxr=RZA7I#ca z=uhL_FB|iJx6v^8PSD9&*e2nL+li4;$kLq%!HJ*pSBdEiD%GTZp_5_6&zSHP?499D zkNTbS^V^`$0SPexX(-tAxbEZ&WKA7h!994KmB+6_R*ll4b?{9Sl;icF5$3OveJMEk zF^&VL#o6OJ9k`W5~C6=K?qhiu>l7>vaj!R{U|16LX7 z!94Yx=Jk^xZ>y5~B?9CP@!8b=rsbp$-vX=kft>ymx1(^Gwz^8H{uT#!wt5AHGd1W> z`>S|OYLlt-`qnUYfLy6lKp@R9rYbfuf>Q7Hd7uD|^X=V1Vgg(uV8Ays0Sv5yRgGCz zOy@>!+AKAXrA%^l3l?NbXi48vsz#ieB=Qs_QZLP}SSI@2|8%6BtQ-*^5cy5|<~sNaPv zM?V#kxRQo%qq|lah2O8CU+*8{be!7MzIlsjBqSeVL5th>CS+ijf!+zRqP}wVCz+uK z^%69VZ)*2f6VHchxb5t>ddqp%;o0gT)QUGJNqsmlAn#`Wq`jgqY!&y-okYGwlw6(0 zn!yVaa5N?t?5n^qeURj7+B;;nFG@nosVf?T9Y+u({4oizbx~9%k<2QF-Wr>)>jWlj zGZ4UAk${_}H`ByZ6s4BIIDah09h|{+M{q5sX9!@(P0V-(6DMl0zp2nLR+RZhg_2nn z7|zs?KV6xI->-4sTjU`~n?*c$;?x2C9qwxMW)Ja!pAD3r@R2vD?*{d4b zFNFQv#W{z~JTymY0;7~mClz!GFIlcAJ{TMXbjvO$2u#6dsa}!9AE0{zWQa+EXj&DW zen%`RZFuC+y=2$v?}tMY>Fd?VI<~A9(-VD?29i3PMQ^W&8wKI zDoNa2u;3DYt8wX$xW9mZ?>EK+D99lPAxv7zK0T3O_tP~@aH?mwTRcdA(X91B_=7s# zTDq8qu}iG95}RgY{yM>?EH7~nkOLK=a;&n@#06WLjomv%1}EaPE8<{}SnvZyJS<<8 
zF}`XEr6}eAGV)mt8AHPA>p$x=*8`)!vaFO6qc*+^i zO_({G0*o@O=M{7cE@?ehyM+|9VPgnE?J9GiDMC;=r9Vby9i5mja6^yWY{Dx0;~rd1ZE%!FRYsJCSE51`WM+oNrn6E$NwK>vR0{ zHyuxfv)1TEkdP6^YVBhSPfxTk=%>9CV%b#hfUBV)2|`tvKhuim{&LbKNX%&N9qo%X zn$RF|j)|-{Frg0CdzSCeXtkQb2dc)Ix*}OnbT4HTphJzU7D)Y$&5ZUap`+R=iDsDe zp%z$#WnmEAMUmyLE}t34=5592c1r>$L6BM}XNA3P^@LKUCcxTTB4vBjCq@oaa*Gfu z2MY4IdI_$}*~UOOSOc~0!fvfY%&MNe3)MU?WaA0SjcT#foO`C*fhEY(9o{bGmpfn{ z1dAp{l3QnU!685y+Y!9zg+%ND-SSrBd;r)GvtNHiQx?EBGC-=LQ-LF&!SOPdfmts& z@H{CyO8SeXSpsY`5>Yh@Cmp`p?urg2W&5ga=Au;3BkKW@25h)>8!*Jj3)z_abc?- zmKf$~Td9ydA2`d&WHLhMHrkHDDSrno<6gj7ylbS;+r7M6v;wxmZffgb>2PuoCZ3n! zk)bGY;i4k3AZV^B*om2f;dqR;rh+B0197y^m&^dfo|KzFJXwJz5k>!JyO8@BD29gG ziewl4!!{UDo&Isr;~By;QY2sTyzrC`VAGFu=L9RD+V4b^oz*_-PGXcqanNte7s}Ee;+xC--Hyh%?un1k} zhSiJN{6}?%Sn8iwd`eG5iD7bHy;GU{uDi`vY4lByYcK<`gj%!s7Cv`UBoZ2}V$s{t zJJ`~&kffQ=#OM`3hNqTX&U~CQlw!C*^SDM&pe>Wup(d6qa%ki-P)V(oLqXdx(|Wja z;HonFNVU?Kd!Zvfn4!#g*>EgRBFSmjr2;Ee5fQ3M?YB!ZT9hVFi)GD8TOT#&$Rarl zDQNRW#wk4yot(pNbx^aXV4#W-5#=0T&Jo*hFd|Q1iQjFATgFJ6$v=#osiCEbp4N>D zkN!-Rnl2i(S;{=~YLOkc>Oe>`pd}6LEOn;bh*jKTV3ZE;(S|tA@U#L!8{FcI9C>QZ z*oQCr9yHsxzM;&y_%VWgplrc8jk;8e)wKNw5W9nF1vBPMsHrSs$OIf-$G^cFF9I;> z23BM-3!+FCKfy$g$bbyKDTBn3j*NVlTt@<~xU-UC8FICBQE)hT^OiT{;NWukUw1Xg zMo~3r_E9&`A|W?h-w{*QsCjG-IVdFVu&MoNr?3V8w95Vyf^bgYIRQB6nJ^vtCZQ~eB(Uecu?d)@L@DaU(Czig+|0)(6(AVV6rRu31czg@ zCS=z~%cBKmk%J0y38#l4d^~>ua&8+gF8$8#Li>iNqYX9czUzzzwG)BASEAV|NSCc5 z=<>Zz*+VENEtTeWtTQ+PSHPu{5y^FUlZGHEu!;n3V*zz(iKr$ac7C-{>Tcw;;SqY= zrgJpWX7xv2Pfj_+FQ5GPi3Zl(wU=!0^gOJIaXRhrJ~1uk$Og6Zex%dtu)^qH=DE#k zL(p~^`SQWRPPl56OaYsxWV;`cV2(WcMal&->R-~;QaFa=HV4pO@W8KMGC}wi6$rux z7$Vyep&xzZ$(}#(*L;85)w32l{zO1*cDKb-itHo(=R~0fXkZM_5P9wNoyrJ7#PMwl z0HK^H7mNEh&#xq^ZIf&TG=JEpi%SZ?(JrWI7`jCI$CWFN^5YsMqVp41YK>jyml2IG zr2#r3*+h#s6}Ts-TI$kskk$3sDCy6L|C}OlBbXt}m1uF93Hn?l`yg}L%a(>ArOdZ{ zjIDZ>>@~O|6!|La^H|QV@v}5q(}C~m4{js=-cogbiSru$1&xXeP}i<*+b`aoa^Mm= zjs(tLR6erQ$;VftPp`BIr!4S|I?%k&{#-YJ(Ce9gut?Ko1))w8INP5yK9HO?r(vZ^ 
z>m-?Zg%Vj_PqSErwZsmO(QPxlI^-a4hqJC(9ps7|>v0YT{sgb<8iGexSqIhOM8toU zTF@M1@CEVaKdC5q=$K@RnkW48c&M?J2BA$AolEP8(>poD0-HAAl5Y zracj{4l=x+M2Zt8GV$d)qzvMzr>=)qv#)Fh%ulcmmZd3T`N-m6G>E#rki$Nh*4h~x*{Cb0I z5Q46$xV4ssrotx6S-beIM=d87{z|>M%TMs=@?iPc%F%UpO8yMk#6>{u@Ud`QgDRPk z`nJ7_mzlMsMPzmvi{PH9S|Tcq$^1awTuKV{&`PaAr0R%4Tkl?@Q}sCSg5h>3cA;Rk ztK;5NYd~#DY@tuSyNQ_^5@z7SbhY70$k;qW1!bWQO?lMykH$=u2`e_II|zveDIJgv%uHK3 zN`3R7DIl{aPWh@Wr~tCTafAZl*YFz28U(=$txXoN8s*c1Rj)L2&~tKB1k7)!O28lT zdIMf%G(~xI0@SWHQlQW6tISAim>(Iv3N>hUyot&sqS#Ve*aLf+2ksje5$_d^e#!xo;L#*NKvZ-&Q79Iaj7q{Y-}5_x23q6H zvb}c9j}NpH#h&_LkQ^eW5Ue9+;*6asJn~XB{8+er?NmU7+iF^VAm*m+;Kryj?A^n&lWA!AC_3B31^lnM*`9D-~J1n-;~P9M}#!(9A8Q z1e4;KLJ$?SB}im&aOfPk11mR2Hyt6Yn&P1kt`QPV6ke7y-jLX-6_>pcq_9}#(RG|y zY6mmFwiOWiMPe!&*And>PxFdFzSRkHoIFvD0=3zUG^|Zhnj8xChN2x%zrPVe0mpwHpH{oAPrwJAl%unayhfh4rl&-0C=aM8<}HP%J)5q-eok@YgeD}zac4A&l9qi z+}6D}RqE14YPkn*)uN43KL`Ntq^F?K;R5@7rfzNU&n7S&O>3*>9JnfJe7|OrExhoT zF!;6+)-(G!Dh6X*pes{LgzVsi;zcvacInb=qbK#--!acBj6#F(^9??m;JlM}FjTS* zKAH*Xq<_Pb2|wgxe3rjEj|vH%xQS4u`t7s&ji*dZg(|OJ+>Am+IA$3<874tWea+of zE;*#+B$ zENwC^@0TErzLcdh4G3954(cCnE&Pg>KDr4d4Gxzrsy}Te|9KbAa~h~W(SgB_=8sI9 zsjmdtKJ&y`VGs5SugTj1HUpP^y%m+50${___;322^fHb4;Gva@1TtC%m|URF zqn#6t`X<`kg`02sLTYkbd*)wx8F=1N)Bmd4e(QIsl`W}wHS2v}jF%!B14FZ~Q9{Y2 zV9(n;=URgSveW_h6&5tj_oEizl!j~XyWJdO!)Dm(u~8uNyvJFlVDsDkH{qoU>?J+%@Qji%w7gGNSPk()A1~q}f?h;Y7ooUFX5< z@}{+LoPc@;9qE1?aE7dT<(k=XYe(<$ZrwO6LBUWf&nFJSJ*{s9fIz3F>ZF< z+0u}c&pGEkCjCLI&C(L{tKqKEr1JV0F5AN-pYy&N$i-R-i3?S6vxyy9<>>B+Ar^AH z$|QY?B1K<-fJ%@$r7E-og^*SSOJg2}i6LZ>sQ969&~N#PyieghsT|y~?}^z1lkUFG zZProd14@fs3>kdp3|oBzxpbr@Kw+tjz&4&E5uzgK(WmE;O2q0eVmU?+8UqvpF*MW# zN$8K^p(ev5Adss&N4T1_&_A2s1T}UsZ1FOlp*lDD5Ci|e0Q{vdDH6(^gH+5F>tno1 zCg<}cKi$DR&Tt7Olk%J)5BH{Gz{8(b5Q8U;LZ|CL`8Jv2fm{X@L{COQbMiYS3esp7 z!?t^TR#I|a(P-rIJEV7)q@BkOR(wmz$gm>&ASFFx3?4rZT>x46ph++%$>I1)bwzZq z?cCyWoORb6?84r092&Zg!{5PW?SpVWoHr{rsMGZM;D!}m8b?t#k_a`hVD=QxzzwPRDZQh8E1(a! 
zjXq0P#MFeywK}_2L#AOsYOE(xas}a38CVe~?ZGByAu-hSQX1!!$vm&M2 zfpO||Z-0LVf}``HZNO9X&O?(SfD{>s<@y+Haz`$5;G@eEDcadcryS7fA$MT$Ry)))MSnmW1r06n&cnB5jVmf4Y%~R29 zV&zc)vH9+;v*{I}rK@?>{XMjx1eMGSdYIBa?z_3Ds%y&cV1-wjU(n<(j5ys>Dxi9v zMsv;I-rQT!R&@*Mu~blHE%Jqh2=e8Zared>H&EAx2ryg;X0uiaOO}x8z^6_b3}*Bq zU&X3MXS#b-{LhltCqbq4v%{}1|LWTOB!-dk%5R_kzUF@sJU6@xz&bDjVDKv31+UVY z_i}fP{&MlhNBvc}ju+j8CvxEcWgfEY6Rx`4=*FIy!6LGwF*>l*F@Cs~l?61e5*d=g z8;vF-vIPTdGMl1|oR)>PzR=MfJS|-f=b7EXhWYW4`N;M{&=z5X>t z@93i#wP(-8L{rmP4B3XXjh6pQjpeR=23}C_94$o!80$nV_qUsbWn$|?WUdsK$4`50 zs2@9fbVB&MfWFpv@!E0Yx&@ksA~5X-VVITt`?HsB8((q%`<0n_s|}*nJ#!q)f&(Nm zF$~dB&Hej}ZqauJ_!l{wmtOGrGbp(IOV>;&yK78?8~c~DvyVe=HkTbU5d|)3{pY}2!fN+!x z!R8``z-|HT;5@F*fw)g_+T?$Bo)^nm>@7Wpp0k@TwbYj&)s8Tbbqu0lI_B;NoXZybbZCp$4#$h0NNXgARTbn!N=GN|R!#yC6 z8oLMSxe6~NM*282l!@jQ;B2Td4%`pjYz-NL!jH)HDX_S)q<**&YlMOn4S9#`6iBD6 zDC5rtWbgS)CzrmM;pLv)xx7PxTn&o=lOe+d#HOhDxDd=W*WKqWIMMfZZ&|jDfP^)n zvGI9S&RVMAFpDM4OVW4f6nO_d(36@!wqDu#{CBn-m(c(Xe-2i{N>=>{orsERwlfj* zAD7ZKud{Og{2+cSDp;73*xVwlWn|b{Er<}*yLBI0ZHSSE_^r@4-4zqwZn*UBA+TP$ z|Cewxv6b*T7V*Ryt7lxDm_P<(7{MS4G$)XE%UYg$N`PwA)MVz>V5@htPgK+H8|oLO zmRtIlWe_d#`qQ-J-WFwGM8=J=Ll8};2~sCpY;@S1=+%rI0n3{z?dZ0ekLJ9ub@kmh zzSwf>%lwJ3JpJk2+N&iV-a9a*#R`H0Q~gX+Td?x1`2RZymqZ^8JuRnL496n!Hi0m01hstwmfNcSSxSE{hK{>PeB4g)K9$9iF4-;Ri<7_vP{V5!Iz14tP)< z4zBF6_6{zKqOo#XA(C0MQs3N7Bk@)By6SdMo<9%3bb22a_JBRQPU{t11I0FltYna+ zG}|GUgQK?Jvj@xFm(pt0ZPX?Y)jw!isH8ev3aVq#gd~-Y`^m9VT^I43nKJZG-$+lv zttCtzpED@#QV^> z%C7<@kc7g&nm{>jm_lSdGz`F?MO7^oc;YS~87`s^go3JMt~GiqpkKM6=b!Rb&4qol z6L!6TEA5kIW@~^`tnsIRiF?;0Ps2LP*%F`;nvGRg^%RTGpEnsZEFxi==*EwPC()+X z#*^mOJ^E>QEG=iBw!g?M*p?O@$g#zPGws@*4x(Ehpak0y4BOiJylJ!?luE(!cdxH& zBXfJr4`XnxeLv{()GJ-br?AlAv(-3SYQ8a^sYwa*@ zUC^8!)%XAwf{Wj3Wk0YIba0zvt?vyP3}fsBj8%o=Oc5zArJ=qbR~T1GMW>Aa-l5@A zid-yvTR&+1bEz@ZI0C9UW8>h(GhC~kaX|YP%AgF@>CcMgjmkY~ETMH@ zgVFK$2IQOCN|MV7lI;vz6p{)qzB|`jiE2vzM#OKze+AT> zStL^RY*1tl-rx4#|8!n2BIv7)Yr9_sW)QfZyFSk`cN9g}c-AJ}yhyX>t*Y6vk+RNc zHL0GK6J)gn+un9;-qe(*$?u=DxWzVGJ7zvcwi+!BmipaZSY-}t+>nLs^;g++3E{Cu 
zb!m-QtQwh}2BA&cqL(u)?<}Z}dvlgd)+N+xE4;$n_3nm#v~63*&k_G0Rl~ypOrYVK z<0%Hu%JtH8_(s8AG~1U-LS9a>q;-8aBpUwfB|aVL;s^`TW|6j^FzpbhbVl+O){z1E zD!$|AO*ct{7VNrf8sZSw>))H|GAIm&vN09cte)@At@w<%$ z_3s@nl*?}|&_|2dx1aQD{m)B!1YUPJ7z6>rnV3Y@2>nplx(eG2&szo4*`Jz8yxSZ7P|zaPq+0m;Texjd z)PL;?!m~eX+=4*-`db)fMe$P%A^pBI(k%|p@~XiJ80xiNUFZO%BL~j zHx9NN2}ZjpFqXQhm)k-A=?eRcOJ`Swf61I%7}Z70R^O7bQfD8t`XyEd3%Y$d-!Ed= z=w=Mo0$N(t6~vfOkSs0 zUToE>TD*=mwZR-?sxRp;O%W1T&8?>ub1WvMkp6?d zq6d<5Vwo5SCn$1g88uN0UB$B{nj#okF|aCb(Hb1VAY!xHo$jU%2|*3N^8fi_2=$@1 z*4Cg7B>BGt67c6s9FP&Pi#Txz_`;LRB*eHICIj>A%#i^_OJLhzMaR@LRf zIp*-=|6R0SYP_rg9Q}l5pR8LFNHED@>|IYqE{(~hH`88zZ(jT^p86xen==3^P{r@M z;c859<{vs2tQ&ZfVrU`@?s1FEdpH>M04tDaJ`K4q2d|N;Wtz!VEJ-4;aj=-#mMMNe zbzz@(hT|C$eMAov&*X<9pisG*&1MTzO1)XU*$G>I3HvTw2SR|*A+ijO+iX7c=w;Zw z+HAtddofY}+aA6>`s@XdK0D!pzMQ}xE~LuANw|%VaaAAVmcj`GW_qr>xIg|Q46+Jx z1XRePQYP9-=?gnxCu{3&!m!I#!|sTXW8U}KuEk8yyjshWX<<@T3AIwGj;i;wR_e6X zv%Vwseb;CY`;^gXP0cbIg}CE&^S-hwO^>&-iI{#!B?sptk%@2Hcrsy*)|S69@`!+K zXRCsIsZ2MmIW)o? zrQZeL;GGNhPNJWMVto7h`+xOb9{`>M2LY&kBVkrP>nlr2;LH*dOJSe_YkkD7W)A)C z(=bj33RZX!G?)i%W=Nut6@$Z|Kv1 zyXCfVM&mxneK_;aJsfEr+2HlW>-n)h-Tmlo@^yRoq_`$YkdX%otxiaDQ;^U=r(OnxL!i6eepM4(5K$aHk1_^jCK0+Vr&OK2-1HVGmBcbf$Xu#8W{<$pXJ^7t|*?n}q_d0^3HASR}OWVe5^N)phlor29ce)nNiCQyI!i z!SC$Kd-bh5P}ZmrhUL|8Y4>hO140tC-+b2iwlX;XEZMjU4Y=g7ps}W_!)q}}>fXl{ z=1H8`Kk`mksbliqS2yoiZvvquz?R&6!ic6dJX5mBu0W$I>y9UC`=IaAh>tR44l_R> z-+`B6X{-F-FU_Y;a34X0(yTkIky^*}96grfnKw!rP5M$jgH02N$Zf||N7jp#nDhCh zhpb~bN?Rdm!f>g7Z{SJRFM&9A!BKkxk;c*m5!yODNY76j)6aBaFbFrad%%M#*yu~r zJV3UTF*z(CTT+e2iycj#o^`}2br8h5IfH&Yv#Wj(@9&OyqHC{pKBV2?^cn~mjQKlv zTenl0BH;`ap-neR0{4za#PkU1B&8rl3PsN(?YhQJgD~Vq$?Pg-Zoy&oj=W~ti#lSZ zSq<5&h{Nngp}@{$6;v!OMCr)7#Rx(T)ENtXcD)$JUcJrecs;c29LsC%7g;DjY;F#& zOP$FhM=jZK)LdWhBy9N1ItWo=Lg>>+V;f+N9zeHm?%RaslO{Z__NkzrdNzo6?Zu$3 zJ<^?Hn-_$7nG<%_0ojiu_Xpoq$kVvtx}qOuApJj`Ul44N6BluN&voL(>jhn( zm0J4VY*O-F&C8~4Iqtw(utuaFnl-ij(|#RSI0+t&YEd0{Sp!Qi9|;jx*e7A{w2y^j zC)hLF%K%M4vcE6IRB*4rcPr7-*tzRuDInVP6k+A07)22Y6NfL0(zCNu 
zq@GzT^BLa##f!o*e*N6w(z?8+K;p1BcJ>q&{E-rcQDO`^{>ol_NcoHbtT*NcUjK83 zg5%+2cS_4;+aeVQo##Ok46QTft?GUQq<-VP(n1!hatsCieeXoFyuezP|L)-&6Z4Tm z2`oFu;3N79ilL+5Av6ZRtA8V}#*nJiu@RC{eh*%s$0EwCEURylm~A$hg8)rJou7rq zj=@;wiq$d-;F#l}L|qWWmDHP-H5h>iS`()Hw0=4Ku4D5Xe&6UACG9{cbT*7@uuIP- zTAAK=hd#8z=390(Rq&qhKbWR4D-t(HrC8Y9%!2FqGX8FTUitR+JaeTE9A`Mrs3%vE z0d2?OurOLyL3%DIv{h(KFMW>VgI~en5+3GM%FAqCYi9VZBn&zoJ> zQ8|*7qxtgXo*OQ(R9Z_Bax*8}ye{+OmuAv`$8*F*kx0Oa59Los0$Qax!R)!tilQcG z*t@;{oWi@)YV`3URA|1M(2p!pdY>+jRpuuJ%wPqYo2JZ>#3tBy`0&wm9cx$2Mg`@8 zrK3ze+Ux#EE|m!SccC2MKly>UXR0@=f8!p2Y*>5fy#qnWe>Nk*@^N6Vf*8qv@C|$U zJO;t@R$L!i)6gEstHX8B4Wd8hdvM}BOiomzojyvXa?|6os))O%J=f!|YapXaOT?sa zg48-_knW<;Cb->e2cV`g>JX{n#5#JKeXn(`OE23xT?_PJcAba3sl8=NU+AOK@!^-2 zX7sff(Vt$#{w|Loe~|~>Trl*9dxuofk7q;9GYxZbe;pKE?8u8D$_|uM@=vk~_f2*9 zD@q|B@{Mx#!LB!&^Rx^BHPEuGTNFRm+Y`D#v@pQ!Lk{2a?6VV);c5R9C`9X zRm`qXN$1NeH;>S$FGIg*YUkcm8)Tz!)28iv{7-KDHYl18PTU^xuno2i*@^J4tMl0h zo8qj9*Q)&g-S2KR)hm36e1Yd=#cUZLLc%0xdrD7UivUIv8H7M&{)sM^&yVBAAi;ix z89P;cpZj11Oq z)q|zu^ecG-yCi_w`Z@=H)_@a(b2QaUr2goSE8SrjHdj_YoE9>}f;x^%V;g0vZb$Ry z*AZfIq6nD$Hv=HRA4ky~ps;7BhUb%BwL{No$ z&sn~GB7nF1Jn1Xvz=l``g%@v24A)2h<@H71E>i_gi$RN2%LraW&&OlpBAgA^lKS%T0f=o-3;xarY>xnx2Q<4Yy(@na(43`J>Rbc%ze=z00NGi`1%EV;BGIBn#Knfr4p z14XquGA=R2J~=VU;%rb{e-WNz_%MpnoCwBgW!*HI0~(|$1TyKXw13#hCG6}?Lo1(p z+~KZ)Fal0b0&YL~;#UQa``Sm!C$58nYjE?A5JW>Ob8Y739Cf*2?zwhCskp``jT!vd z!%A3v36v0ok<{Q=_Oj@kBRRInuI>g^qo^W~b&-9BLLfx5Lz( zW~p^ixli9BIb`Yoh#>D(TzSU~lM`KFUPox2!5$;>5Jn@u8=6nLw>ye#^tRsinK2m7 zMduCTV8oj#JCh=5VcD0=Y?@&;lA11K)P;G)mbMM0V`X&~OPT8cSMtbetZj!QgxkH4 z9=V%@78guyofP!u`o3c}#B^vh?U#{*ZFosFHi#_m-M1^wFsbNg=8S)9*}@bsRC2xA%uVhOHn z7Oh~m;@lzv(p@n0=GM@*ijgbkCTJ%G|^1$9(mTSjVVX#CDA$YC)u=Kh%$ zVK%AV)0)?{y(QQC&U1$sf_4<;nm!Uzz2-P{yrsfxiOYhnd27^aJ*$jt!AByh6Rirq zj4)H|$ao0Ffd7qW$Y7y}b2zoGCI{0Ap~@6sk6X-~FCf)B-Ifadm8mJ+v>~3)?eNav zv}??*l1o1(%A2!=7ehF_39NzY5KmVYjw4hLVdhLoR&i5yz)qUG1gEFd1uHHjrTOOdDQ}>)z=SQ}$OYx&ynj;zu|1q=gJhkD zQm@noK;#q-2%R!+y=V#4$0}FB*w?%4OMe@InTp(=wcmFaGV`= 
zu#jz{VM)vLy~E(ZI#bp4Q_%`*CSN$}nI-VcZKSY3A&Y>9@$s4yl|oIjUTVI7L|~rj zi@4s0n!WWM}I&-rSVAs(+t3o)kXmn7*6uf9pqL#JqjXANN=NgU7XLeR1Z0z8G8AONj zSv?75OPB)jBXJx8``^R)buN&!h37j1=`-pOS>O?;=Tek;h33Z;_iTDaNhe}dJ{0pA zSD@@T*}m?}38ZWPFXA(Bk#qc9maw~NsR+D3W)!(Jt>f+ z5Qqs$Ju77}@W38cO<8ugQgHzqn1S9Vu(((sum`f0R|@wPz7vs5)yXGRC?5L0(Z39v zaLB+c%eXO9!{&@0H%Nw!$Ddp=HN{aN_Ur_}&FE$q<5XdTIpG%jDI>7kCcE5IPrq)}Kp#~C2GMm|2H4-B?#S(d!m5Ppa7g1%&pbMq-HD1O3;iBZlG z0cT)Mec9pijKgFRb%6gU(jm>`5@PONWp5FIH8w6SPZ6GeL0M0-!$C~M!uBi8HW@r(8s*RD_7NV$97!u2c1@FPV+ zSkPuoJ}v3}yhv^Q>Lik|i0*z4B|3L+t};S;0vdfv1Krhs1_c8e=r=;yXb2^6mMt0! ztnIKPMhLEIK^K(GgW!=Fh)I!1KW+@JULM-0xiRjhQ9E)Xg1K(Y9>sgB%isM3E1>Ec zJQC@X#`vl}BE0j;q%1Qv;!UJVF2UF??_$@%g zse+*@o?<^JN(4i3;8O`u6apPe3Mxtn(>%5(<_lH>F-p`4o^T3|s1XZgXdJnC)^wvr7zrB+jeyXMlV#rw(>2DE|-Mi3z|tRqg?J_TV)m zaFura)&S1q_f8e%KHy|nWgms=k0G*T=3#zsurZaLP)7b|hglAm3I`es8*YY&z3IxCspbFhN8rTo(xM zyYQF2-b%M>A>#-2URj_Q7@$zKVvz+N@{oemx6n|35^lH1{aEtwSYRxRwVR5;M;XZmYA8~5|4zvT8IIGtDVQ`UIwrPSQy5J z;2tOx5ity``j&GL!~h>r4jj%^gjGX=6{rSQ5CZf&P{TslXhlP1q8W=W?FK!a0WnKv zK;N{A*eA}Qk&;L`@-xs8K&6aF80M)P94(NMW?0dUr&$$xMJPM8Vhd4{BOddBa~DR| zGL6TNS%YB&Fz_9;^*rj!Ov5&Auu^LG$uvHZ zSX2$n<#e;qSvXedDi3h{AO~?{1w$m7KxWn!qvj>s{sfBPrbY-NJQ$=)x*lr!JUHK`$~%o{5v?V)<-xd!GE8(w#hD8AaWJ;j zB0Zf&tJ;vbO>iFMd<(`ASyRy#(au!HuD-Z}XZ_v~FX}9sZ|fexDeD{w?(1G-zz0}x+x+!f&5=&sFZ!zE$hsG0H+~i8cPr2?q6V?rScXbsf9zDf!40P(|Ippli zODAh<1;sRU5M~v><6wcEK$K1)-nJSynrScS3 z6o$5Jh10+*jmwNP)Kv_cayibagLm{OK1fI#N~0JJR#aUnGEbOra$#*Wr7F72keO=O zUi#M931I;ne39l*G{q9)E_x{eqBzZ4Sg1Q3TAV)CvA#GUqnH&g}7Y9*=D6&%{hOha+m6iS*c8-G&>oj(NL9j-w)c8)=?%-Wp>|d?~$2_sI-NX zot=}yC=UfZjj zzG|iZNBDfBnv|;BL>xb292=j`Tjusc{Lv61iiysh)*}6^>*mr;=0mFUXa`{y3y8s$xd&;o$K@{|)l2_pt-gFZlO3O!A0`I6wg%Bz&?8HK%$J z^6>;@Uo~J2?Xi8Fy#f70Jml0sC#R}Wl!y?g5f<#jVh6RN6;I$j-o^@^o2dKB;KwD^Yw2$+Gqz_nV>WB#d>SBKAh0d_EhE zs2OT9sNT!E*TlBe4m)7`Be97`mOYTh`GP5PJQ!RGRopVWv3Y)Kw@*c0C|4d7j-97M zmCG@@JB@I7D28F}!*2lO^J|=;owEzg+E}!`oURuVvm3%11OL-{>QYi9&GUYya?#?!4fsJj1%xDR1HR z9)RHjio!zDBfGM>OHEizdY5EOZaYb`07E1LXs2|LCtlJWc 
zkS)#ttoSGce{}zKcl$nmmnRGoQh>Y`FDv-w9g$K%fn)P70+Nm7IM*^ON>T{X_9ROk0g#fv>P`vccq5x_ZUWmiL&;eah3 zEd(&{s`*ept!874?V7(8eEj~0p7*+iJ9e;Uv^*L;MD#6xn!)Lsqm8i__2uH-R?{04 zZ%}*mR*mADeHBkIB?KX*i4^lvYa=BvM~PvaQe-9vf~pXuI$P5~D7zZJEQZ!1)wRp6 zTW;^kTKXRGy`$xb#z^6go4oko{s&xu#+YllZhthH|cKM|9;(JpF5Eijj1vcwsk27C4S|`0WmN$F+V< zV*`_&-)$tDw99AycEOUztwO0ZFiHR3D);mHed4w${ zToFo)>5v8ac6LL{lRc5i5AM0_vRBdjcuSnN85f12_(kBRUpsq&KUK>|p#_iNowLUG zl`v#=<3Ql`pC0&!%8!HqOqJg}fVar?V4`+C_qq|01*k#YYjs!;r#Q7SdjV)T$XLgI zgEw#rmGUGs#C`=c%DFPKJIko(e@wsF1O!65yKBL|>!0oXwI7V45a`_HH^m?jiqjK( zn;dMUYDC*CFy}DRscx=s{|pqM*xnvUIY`VF;JDJUUhULVaPd`&+VQj$%x&5jZe=gi z-YtoQJ_k%kj`9Gx)~YuC>iWC!fsh6Zhz$hoBodUmrJ*So1rBoWa={^xy{FUyd0a}Q zjBuPD<3LYOKEt|Z8x(X-cLW^hH4VnfUE6)21f!MQnhLt5?#TPcb?$_3Ixdkj7<>~7 zJr|ywjel3{9zAzE&tp~t9(Q@h5=BZ7zDnS6#E_;JN=dX{mY!e|O8*1=Vh!N{%8}d~ zeS|t-Dm=r~>zZ-h4?&Ga^~yg#vY>z^c77|Q$lph?{}sJ!cPb0{wI8y7>e|o-ZPYr< zrn%vYDzcm`RC{_)@>lXCJxLRrt{#x44TGK9|VVDfO-oE)p9YC`}Ih_&e5-v&baG=>wU*>bx z=F3TZSahoexgbKOXo>BLY)S?+3e!wrR1AlDCt{^_f)YR-ue(O6gcIU@oh`TMY>g5JZ&?$qJ z-Ys6r^#!<+T?@UN*%{007A#>Mza;q5`~q3RjX5fpFiue5JFp;NP-!=6LB+QyK-xmh zq_P%D^+p%wZ0HQ?ogSo@t5~jZEW6(xn;>IM+ZQ3Cm<@4J2Q+c0#8`8p0bv-8d4h;3 zH>`#o6KBnLn84bs!A>4y@`j_LVzuAGO+Q4p+ItFV&YN6xa4vl>Z`L}c87 z8Jap;nso4$+R_X003WsBvW#0P>58tYV#-2MG_<_?G|DV>tKu_#0Nc=!^xB_e;i>vup|_;lar(=2|O0Xn5}Hvx!St-@atg~4U7~xxu7P` z8)!AG>0#*;EAuA2Je^IBvJxzT(;kOpv(@8q5-#CIp~Xk>gsSwb$DkcA;t4!}m*>nu z#p<#Tj5pUc1XsXrsNunB|GJ%tLEuy01PGy!nEW^;-fA41$fV$X$5xA|HW>~AF>Lwa zvu~&wYwyK&JUt~!MfQK5R(oy&ER$R0%yE?cv8;J;(ERv~?Z2DC6-VIHs)&@tji{t! 
zn-{@lwW9gMCAgs4#`uGr!;A0|dpwmu@XIx2JYB9L2(t|e!TpYg29O-1slXqj@@dxE zm7qs6RN~V8ciKW0beT!oSnHr*8&h&F#H5A^q_0Di@OrP_pmMbYL;;uJ7DgMK1azT= z>9OmO^B^gJ%Bi+m*AzlGdjfVA2vxuWr7kiqx3Z=BgvRKZ2~iUR{9D6iE}Q)T>!TB{ zxZ~P0y9(|}Ab3nyY!RGHr!^sDuPu2>+Grw12~X`dLYOxWW(6a)i*%L+4$tE_Lq*5P zszSK*dph3WN#RRPj-S!&sO&L(fmwnx*+8^8UoOyOI>=V%ZKGTfHeCAQMJUGx8VtV- zVFa1wB@yW(duCWic(C5=^_n($x6z0K=S!*U{UVY)oN#ibQk^K0s zj8%-`=&VB95!nnBD`e7NTkzw|0NyrO#7rh_BW+_UwlCdtW|gW+=h%8ya0G-~d%>L1 zwx#rmkp;Hn)MnX>FdfD;0d#y`qE-=EH|zKvNhKwYa4hK%(sG&qXYVA;k_Gg!osbm6Z^y#K2MU5zg`r4sc&Ru{bS!r1cNVi{c;T_#nGNMmVVoHX zbs>WOM>p&~+cI9_Ld5zwXM4|54k5%j_U6=ju9?ZRv+D)rHwvv1?(SpUr_F(`l&OnD zUOJqfu!pW0n0@zb?zp+8)*aqrVNp9=mS%HCQiVKq&!q}t4(*f-7zABnPobS36$3=JdW&-SbF?v|8CnT(hvaT7N=5MXR4kSt~-U_i-^Ke*?p)vDL*D(AQoHyQ`Wx-zW zs~8aJOobwql9a4~l=S##Qf+;EB5$Wh#Z__v?DCsi7vWHWx1V&)v2LY@&J>r|r!vrY z5khGeb=B0ROXxbFW4|5m;ZL7F1*=uUERu#AI1+e?Wv=&u7?o3 zT;_S^+*I>eB!racJ~@bQi- z_c}XVSO-Xr9@^c-m8qFce=e}=99}4E8UIH8Y`$*1>+Zl}H8zuHXD0oFGhpoNNoae@ zfox5@(*=~^%bqD^;FPE4ONF+?7V1=Tjy2{=noWjdIfiD(%9?fkK}J<9lyz}c1yftl zd7mYEQw>`#c(!AE9+V5gz2P)s2QDN!%DCEl$kNYIfgk?KwvG#^c>pu zAarttn^Q8VpwV<49%Hw>Ccs~Ty3efE#I)% zBjl1-@g1GVq`vuCMF5zMcbnL;^0LcqF%FK%y0HkE}sPbgr|qqXQ1{YPQsx@%4c;DdO6f`1sD z5+`b%G;5QDHF4v?1Vltpm(Lioyx#vxu-dsB)|(oEp)YYD)r~kE>}Tu`z+niNZNp4_ zo}kqQC7Sa>lZZj1snQPYxJ`sF@{iP(gqnHOt@(GfKEx&J+lHT(GApnY2++ug%;P63 z=3-nSXk=GW9tMNW!#PWT6M0Myjt}FM6`r{hu4i`YxN7@^s)@_2<9QcH&Xm%^O0R7& z2YJbz)*nfVpq4B(chkYbn+~Abi6uf44i4cs;ftU`JLMu!4&}q7@5M&tT$zJsY1X!RYuweQ zH_suouza(UpiTG)l3H@yhg(lq*wY1n_!b?nT6S&J@%|s zM|o6~DV!YE_@`(4{5`Hs1HsLy@0*BqkYA6`AyJutu^aX*OzQa+VEo~>if zm}kz7;aMRrVW?aj>hm8V+cXGcH|L!(p;Mu;aUcDgps6O(tt$h4qSCz(ahR~uIBT3% z77ge(BKiP>(o?5;^0+XaJpwb|a4K*Z0`wIKkuh`4W}%qG=qEJzJ2JAx^@II=40`Z` z_5a`{+KU|hEAW9nbDUr|wRYQ?BoBrO2WILFq#`5yDbGBdfblFh^&5c)eFG#`kiqxk z9eRp@sO!^XxYMK5WVGF$z!0Md%V0N*>+?8^@1+M-{NB#a+NE|@@x0_Tz;3x-bVm4x zbvQFz4{H{=EJ36a!Mq?qM(k3zPwpS3X&eZ+chwHNku%&p_oCXHJd{4!|B~Tpx|mT2 z@~WMEK33a4nsLM2jwtEQ?)e-qqlC 
zo>{v7`IW+#VE1L9Vc{yty%DT%(zT(dGIkwL-y0oNE|AktEvBuE^t`^2 zUm{4$PtyFa`yaqso0qQ!S5oW|g2h1lrQ9F~H|00@7l-~OkW8WDNcA5C7xv>2%>1Fa z@^E)C5z_H6bluYF@0qyX*i|?dn;Ic^~662KW^#buuR+?iY&`( zPJ%1B=r1hpU$~a615+@X6wrv~G5i=_-syhV&HNXD4qWgwr=th`I^bIKwu0Rvf+(Oa zeGg6{kSOe}!u;0fwmVZQXl}@jI;7_dSrKH%jTw5YAw$c+9Lx|6ZV|=v$S&^(?kyM| z*yE)nj0o;#gO21hn{kLYPz!T3<1s7`E)ED$XHu(l?E^y!uwQg)7qbx2Apps-QsBJEns2q^Niw#h@gGl)RSGIWC z0la|ZV2S`tNTr;8?xcY0W)(VBPD{bol&&|g?40L%yJ!Ipr-ZRcaSMM{;~co}c+SY- z@ZYiqxmvBRgSpwwi<|{w2lwvZHFejX`eY=rkO-GO+CMB$wjCsT8*lW z`6tO#^e~;u<1v$iDOL+XFFXi!n#PJR(FAEh^05r^n@VuWVZi(@!*oD1yFnX==q}6o z6VheJa!{Bn0!02ImTO?#`@ssxC{>x)n=5Ufe3AKeANtm|z$&B^w z|2GNYbJTQkGvwXh0wCW(8x}QLPT}71hiUA6pqWKXoP9C0;Wcvp7z!5s4b&JiTv!R> zQVOT|KdsGED2Q>Y2X4jM6M>%gT9USRx9)*>Y2Xq$pj+Y_pl5COHfUPF1Xgd7vi}$Q z`1a0p#jy&yDO>fbIJ_as(zYRSg)3lRap6u(PMokMW7c$HoSGVowLiJ_ zfx26-JoiVwAh@kRxbm7?!Pi`Q*|~GS><#t5uf@HhGfkO7q6}}y=G^*}y4Z`Cf^}4- z&_slxg$0c7sbS?xT%j@l!y_n&VW5H~Ov-$L2T~|u!4z@aH_KgIHt7^@n$6K>O=+^; z0C{mf2{XzFNCtzLf#d~G%mB##&yz)caM$zQON&r>2oZb^WX{;$4biw?DzQ!+ry9#@ z|M0*Ax1M{_2`S1auVMW%rYF8co<9GlmmX%-xQ$!ii}aVE=b^1)&|~II4B)1OggbH8 zeb&@A2PxJy)=yZg+J--TPhG>B6J`xVKXy1gN-P=64P7s+G=^vWWEe}{o$I##1aFgk zXOMtog-QReJl&*xHR-I8q*70TH+edK@cASdIjxzGoCi&==$y4425=;{X~lIx=8ycJ z6pokA-v3|Q^ZwyM9Fr9f{iI_)?92jcTZXi3wUjs zA(}f3RF)rAYTAWRCwku>fd$P5dLSKk-Y#+xUGg}{tY5HMFAZB3ieMNTO!7FCp8P)v z&n7KZk7@4AVEjfi_hdd_6K8>ckM1wg_x0lIx|lZ^htW`p3P~lXK?#=v(v@LHn4U%+c~#p)sN7Nblr`0^Zmj1 z0;M6H*1!G!&M^*LkWE#hBEHu;Kfb25^m)8dWq*u>R)~OIGUV`vDA)a|uIF{!|NN^| zenn4k84fjhM53&`CmG~>GAoWol>-f8ML3_noZYM5vhAy#;|Gm1` zQtU0t#85N#2@`+eLHxg`QV``cU9#~Z6I{eY9>C#DEF<%dI~ru z%SX@o(&yu)sovWHMbkZBg*WDzjjITMy4La8glv+MLw7;iwDJC4H)wZV9%7PS72U-5lWDOlQAw*EA(h9Y&Pk) z+gB45gkl`I0+76{+h8IzbxIuNDR2@^sNAHBt>P_$Bxa>b2D|>W;ifiNyao0FLVyUA zxukUTp226v!9{FL@+}>%g5hRf?A^mO{kq9BKzjac1O-eI7~aKLc5idj@YaHBGF z6et@eyw;Bux6EbD^s$#Iyb3BB-%72Mr$O2ZeeSm)rTxeM29|ZdUP-~{f!%|zm@aLl z1Gn^r$}7A{U&ZoHqI>S0TPjf2``d4Vbf_LSyVXh`yt`}H?VkzpmwxYdJR&Y$y)=Cz 
zfOjbizUg>|tmWpfLg8v~>s7nWwsK0eXKh^<_m*=I@Md^jVKN90>L1m#0D}Fb=Nsg= z?!Z%&SB@#$-ki{2xEJK?zutzo(O35g{qpd1E>MX1b-RH!9>RdXVx|`pj)Lk8f49ip zar`sn+4G2!Z}o@me*^1&a6s^!v2O^7 z=bn<`a2X{@?$+^PBk_suUgn9WE6Dsdi@%;;cpmvTqcbn}dB*4i45}fZR06z{?1PJ6aO%H*{<*TZE1ZA0``vV#eLpk4e1QVU z^}Um&#l?)-bb7Yc3zxy$^}X+3(Ef7IaB;>j$GyWo>7Lp2aBg~WwESrA`%@h~pZrI8 z_T9h!t$!?`?`OvOmm27s^##nq$V+Wl7CVurD59HAU&}nAFUNW*j+1Oq9 zjAt32rvQYGPfMD<#$ycCLO5(28p8bztiEc+nm6$8B8IJkj*OCUp?|eSt-`dyS%y+h zOku4879Qs;f-CV$ngK*@C*q<;&`em(NlL;wZn^U7V{T!Y#)vDg;>rf+ZqXFEFn|f@ zJ5Z8;2K#~XDvi06s;cm7<<*1FE_lJ~%>W&pv8pJ}z}s4;-Hfi}=VDCRz*sZh(Ox+2 zn<%$xC|1&>Ke~vEESoS;zHjQq^h^xGIT%82fKN@FDbVp8yMai`3*jVt7BFYKFe^C= z5r}?(Z)w>7Q8CY1rin1y)cA-OJkJ$4_^AxcN&f-bL^Sk>fWFgXCKW|T(JA_d35P!T zlV`Lus^nh8`b&rS;TOAj_Zx*bwYv4o*TnhZvo)^5hP7TFUi}8@xx{U(_txUP`&>A# z_O1sIAOTD7ro_dmx}rVXx~I?1kno@a97rz;;ZbH+7l(YmTm;8L9Y9^DEH8 z>icT^I0xymmTDa+J}i3TH}Mi&g;#3G@BXiFjJ(Fu**okm=M%r9a4*vx$$R&;yvsc; zes>qu*rGXs5DeN+fO3adE!KYtCq2952y_&}e_NfQIR=6*`hN0ATTSfdv6tizp z`?YTHY%lNePn!UlcP2MDMZN#9R;t0twZf4%cQyN)qzh>ymBOd^v^|cIw2s&{+KCJ@ zdx}>xi#*vOm_d<>$(6Q1Wwp7fq1DhpUe|-JFXyt^f%QrSSpb-R-XthObh_X;r|XYc z`hKvWNYAEvhf>eDRRzu}1QC=VQu-}xVzMTRs16BRukkvc=@P(X_n z$f~ZZWuW7@p4VuoNUdQxQItzycpfblP=Z8@OKbunoZ-WKOH|}ttk#GHfzB~)_2r)Q zGqs5F=s-iTqiK5vo0?O0O;OCAGKw4FsdIbr>eoVNx0Zb`m0!Sw76_$MV9gU6OGt?< zto@Aa6q~AJCRKRvWIk|Zu@Vr^d(qK83$)(nO2Tbl3aUAjG z<7!pY7Ouz`sjl!#rGtkTF*T6x6Q+XQd}yf?3>T1+~m`)ZtR^qYfv|5Bqu+)ZK$h%v$qaMee!rN;_knJ872fMt494 z8D^S%Wc!z^XZ0RyA3ZdkcG^Z1H{Rq?juD2dF49p@X-z0%TVX&-SxHHnNTI5c^hlbK zRa|mYA_8evE*X|FoX-!_kn6{Y{INJf>Z=p`6ssY6?W%`TD7CAH`}U#?yN}|Cda-#) zR%DW}kV;j|Nm^q^j$Ck@8=q*H@erQ^1Z`3847*inY3q1G4 zu%jV&I?tixU< zkCK!_Yi97Y0R+d}r@3)^&Wx?SZ2xIyHGpL83*wboup&r%-Ly1$*(<<=EwH6-$3z(8 z6jn`9E#r{9m+Q`s5!d?_(79kG;ne4=>s772(CVSR8*&Q+^&}oW>WgQxa zB6>2Lw*Vzicp6D&b=8GBtJXKx)gm0ncBzSgj?<~*px%zIyXE@|X4yVV@cQ!zAwKiy z_a@7h3cqnp8!_6h;J)u35rm5bG5Z?1T@Hc^A_gWr&_>Jq!qcXco$CwL>5;NG5N`hK zZQLb(nN4^Cs-UKQ)+*PrVxhZe9Yw39!jJCLMSJdRXO-8?XeF!*Y#zzVWwmg|>!X3T 
z{MX!fm>CYyEz9nRwsU0XB@{0H$?}5_co|le)V1b9MXm#K6opWiyICprsYk!pX;PJS z-8NL7rjul$s){TL;v=h6qA5!YxyqnB(yr$bO~rEyG53PnuXh5^(j>dm?)CdTE7xny zcI}=L6lpLF31GLGJsP6P4$JTbJA&Kcv`=&LVowZK0<;7&Rw=T|Q6h$)9(jkiwEzV(>Fb$@2nzkYZBz{eT z&6??p%_K@mpj!Oi#Vn)(94H^fAg$(WsZMy_G{VyvRxDx*$6-(edYBp0@+mf$@>LP5 ze%Kd>NzW%bX{&!V9X=F)Feb91kN8KUt=I;-M%{n@-G1k=M&Q3JvG_?^;GJbl#2>Of z|9%lBvcx38h#^8J5`;xe6KXL|6p3Eq3gN?glM_0(mz(u>6tlNxf^(8!_cz^);=+p0Q-}<{Xvfr{Oc?IyZ`#p~Dp7?`LE}1WM zHNN(!1xy431pNCZnB#YQ#QpD>(%HBD->mj=U2y9lY`9+gLBylkUgzyyC*ai!Xm>vc zuZ{iM8-!Q6gm%He^IHVCnYmOkL}jt$0I%5<;pWpFK*d}@2$lju%CByIuC`)vn<032 zTUTVa6cf3Dm%6Xf{>KeeJ!%wQb?Bc4O!%CW#syVobgR@`gYy^`e3?39J?Tpg19eg6 z`(uOKCJT6=>db;wV~cNSyjs1H)A3D{`g=8Y*bsq~CO-yfD6KE8GEd{q7i{FfG|S36 zWZm_j#WXwM=ICi>cf&i^>I&)iVbrpkWU66p9m3^t@O1jW!m*Z_zS+;PI4E)n0z`N8 zu+svd+Uz&lJfemc-;pwJ=Qq)A*MSpip%2lDFdDRBF|Z#@`XbunsGq6_a%Z_rW5Q}x z0}o){h%uH0M)f|z1U@T%KvpcFmWkFBca<{c^;{b}cy^vZl#MEaMWK{{aGJmbRj;8P2LBnh)Jdzn0DzG{waoEUV`Q1R~m0A>*q27O)#0Di$>1M(b!K736s(8f&}Z z9gu|%4a{&trqJ;9Fq;tAnwfE|4xD-!UBUnk$>s?zGW79klWJ}|B&z5^YZ%7Zl!=Nd zY^JTybb?O(AERB8rK*1PF-&VO1{s1|?JTQ!EK^B1)k7T1W&yiGa8ukSA^3YB$Y_6N z6qc77GAJAXL(l>x(CgKYphoLwGhAO}CPYW96$oaWL+=@#mUbA9Y?Csfs25swt}SuJ z3Aa8GirTyvCyMSF_E30fFY zwx$Oe2tg~vHb&dhdA12E6*}#uXyZ(G3RS46r5=k^(E}H%V1Z0I_n?)H)IWG)n&lIv zwZT3Zty8-?baWRHFm1Bzz)Q8Y@Uq)Ap5Vqt*FxZ(Stf3NIA+k$i+XGI5X<`fb3S-Tc!W`q}-p5qZAty@U z?(CoFkiWz<@FtMqeJF)}5Q)k=c=*QNWIlhh&O_*0?Q4br*c&)P&2}6FPbQ>ol4M=1 zyL=m&oEZ*=8N+OyU$x0j{y(QNB@h8m_l>}R1|lF}ClV4aq7d*T(a`ul!?Xkktb>W8 z*$m=QWrUFkG-;B^EdpyXNr$+Z!E}V+NS&v5a-xjRkj&>Tr?M4LUaxt{AOo9HZH`Z zRf4L`)LqRx9&Ca?B&nm_XTbD+nTpLNr82e24+Pp%D`aAQlpYJ;{O62qO{m*sDw|RcihbVXt7# zM?D1=@*0lOvf3JJ(XhzRo$f_x(;0 z4^k{crYy9>mV-{X^3V-mfkLL}(T_;658gnz>^QOLwr`$m~#;OYLT<`spuoxhw3m-<7U%z}2pCt%DAcz1I$rYsMO9 zya^_f#+5+Mb`MxOz|Q7zNKA0HD2Fw{LHUKii>F*`to1(-9Ga)BW?h~U!5ixx&9fW_ zY4bQC9>>H4G8@GP8~tLK?Z83vu#)$~CIe17?F>BDEKX6OC9oC`8%eNP#xc#-7F@F^ z-u_zH%7-7aS(LNB9AD}9MFqa4o*|FFO51#3$3EX1b{>b^VaDGqy2QgA^NUeqF4H9- 
zzjK!h`N5sh#MnHMZ&(p3dlGQA-LR|EF(0_#d1sNP_+p~4!(;H$dFa;Q)&a?1(xh2S z6f%PyI&~4u8kXtPk1pF_$gq|%YV;E{Q0U0EghbUxvSD)!Y z%1#w+bXt6Ct}&UzuJgX%-q{^q813y}ymXM}jBGF7@|9RT(V9%PwL@V|f4?VMlz0qx zpnVKx^Qj-&a$Q)U{gl%mK7M3w_elmsB2rSxlvN!Z8Xg%PgK5iXhbyKC&fap$qE!Qb^d5_N$lZJQA3StXE0UjudO@yWVY;Tp z`)+wvrvuE|Q=`oXZnD4$V7VFk2y2e(rH$Whe0kINnqJw=@}n)yXOFc)5 z{X2f-;WuzFaI>B6>sYrt-s$|w?svNY-rk+hc*x=y_Uq!KeIKW)6Io2 zl8I!Bf`W3cgt5;jzTWV9>iTJ@zF5npd%M=$u+F!pVRE+0kBG)$^n?J;*YIIP&)=Fx zd+YDWbE|y9>(-G36uHBZ`t;)9aN7A@2njsk^n8$1lg@i_6IhvhILtUpG4^FQ(`VNN ze&09859(INFc8A@P@-I4KhDeRyDKM^W~w)es!J6%XHogL)*wE9r-Nru^Vt$=mp08;Q_$&FCXWdpUZ9q@=LY2tByU8V$ zM`9;t>tpUu*D5X_p~MwwYsIfa!A}oMe>qwjl6SjL+MA@X8r)k&v|jM4xp#5j9pBBS zD@iSHr?j~98#S)|o_^3Bfmht>_UC9UPyK5tD=>qf2btyAk>{H{k$|uG)6YytwT~`dcTQJo~vTuDba>kc)l#`RC5BE}8Z@@s5Kl2pSX!4@L!J zg(4yY`+WxhooP(*8gcDFg970v@p?H?623$1Q>P5oSDZ1-pu87AM0hY1>%{iaBEw@v%f*d_Ro?J@ z>Oy!}i683Ah*87mWEa5`^Mn}~=QL>1LRKQtcHA^eoX?gZ_N?i_kX2`i0K z54|{nP&_0Iem;R8l~2oRAlRPF3~znfZBHh04I^VUG{_4J3{ZyQF=-M;K+Xgf>h+Tn%H2XJpZa2RBwXy? 
zpEyC>)izkODLY1hAKtPnWLZO);=@~UB~~#+CJtjYB1`VPnW;C(em-S#CEpb`Ud`@C zy+IX8t2`O!VTFAh$dlqe&Z9oBJYz5`r~hWIcdgD6?GgJXiu$0>Kqq|r?dPMNA5-+x z*PkDV*Xi^f5HJYI0~Nun1P~X1`H(1JiEP#+nvx6#Ln^2Z0R|ywpNbL?jG&mab2AjO zvo+f^?B=ur0I-19S*C$#305j4g^JQFQWBaB2tpcCr0Z%^31OEyA4OuAM2|?n(D~Z1 zDWA_f&~9KnU=WgrD`L~*1Bi>kd`J|qL^f*@O-TlWAr;gn00tpvuln$fUyPm?4G+ zc8#0B<=(-??Ps5&Q;K~@XqQ53xL;MEFhnF|6s$q*3pKS10738wUIwM5WoSk+Zzwa8dkkLLhUn*YWnhB1 z2}+~FN@-B|h!tj&!oZ5vuwq?EVXzAQB(DTa5HJXk!XQTk1O$8}K#BkX10V6gts{Gk zu*8F1Ouu2ZP^eyU5LG&dkJ2d{$x27=%bzz`@*#TJEefE+=>Tc-?Q2oxqf{~>{+ z`%;Jbl$Hx0NeQFz4gDqCtGuYeLN(-#HZeAr?U|4NS<|=x+aoZ!!pzI7M)!Y+!kV9| zw+riO?G060J)6$o8ZA|5!xyMT*C|!Ap4iy;nBLBpL2jl_ajb(H(jm)Bbfids81yG% zQ1G$#x*Jl&YgdS>@DZgn@EO%~TTl-YWN^WlZFLmH{t&6*~AJn_{nx{UQG-I;1= zef&bYg=tBA9D+XGrBYENL)z?AyN9T5yR4PnpGHlLv3w{`NITQh954{8imfR{wk)vj zWlDrPT+h2Y##Qt-Q&Z}ye9Edool50jQY|IhN^8(GIoK@E}hC8spxu_0VS@ZR_nUU z!1vE^Y3`5MEU;3#x((5~rQ5PZPZX91tM}j)`xTmTyaznU&i-rnz#FRQ@M1Vy{_CaT z6)mZRk>m-NOKvme386sZG`vR6M7b(#N_&024LKh*rbyDIJahZ78QC7OTO-+md#gpP zUXikP&k(4$mKP*cBc+2L;j1Y~C!Ola&sp#(O-Mrn3zkDrx*BIcD|`kNDbSkF{bA3> u`@~>4VDnmdpI5u~?YJ}-tm101d7HMC%3^xw;0yWZe;@9y&kz5f$5>G=e0kbmz00000000000000000000 z0000QhFTlQOdQ_?KS)+VQYt@9RzXrc24Fu^R6$f60FzWNeh~-?gVPLx>?SaTfeZmQ z0we>YYzvYM00baEZDn*}F9ncD2i!&s2U|g6k`0Ap?EYae6+qO9R;eCQl80@&I_1v0 z-62ixBO27Y!%hhU=RwE%%bwK8<_s?(RZuRE`4&eo$ngWD2Wme^u@%29ph4oHZp#%~7^H<&sA!YHw_fKHCpXQrG(&4M)XdP@3}4VV6SCE-;Ve|kSe&BV?n^1BQqW|Eniy;g|j`l+X2I23tbGxetqbH=dB;ot?#E^sm7s+I?@sp-eTI zghpt@mN4upxL(~`lUkC=&aVT$hUrqw=*^(zgLrv1P~~9iWiYyl0aGShs-=ExE=HLy z#W$NgQy~SNQ~;;s&~FdgR89GYH)QX&d~Ne0kHteCwKvH05yehEDHIBYRHUM?nV3+x zF}Nd&l?CuJEnT^$980+5^=^!OK3ah!B;ktC5(xPUb8mBkpHc!{tuL!@MJB!>3Q4Lb zZF(a{>*#b;mq~W=qnWopa8i&Jht?b&EgfW`>4C z>+CQ5clOzAtxFGc>ilA*v@e~z`v&hnU=nsS5;#|TWol%?8CEplPvkvxeOHIOTDE=d z;MU=g5{m-HF<Z7N@x)&3luO=K~PjwBqY?2&EWgL zksmR^#>#Kt4>)K4l2lVsv@Lb4MRY}*1}LTI2190cB1Q?^c3)K3PK}8%SJWmIOLC+$ z|7Uh3`@NanD8$`^4;3I>Zb4uYSwZlf8>^Mu08se>s0={#A6FHTKdbvy4P7$}04IPa 
zKsSh^(?U$>d)8nxfA`mBL}-4pBm8ga^MUXlJfGYvavY_vuN=oL z$5Hx9DMu-Dgm}gyhEQew+ zIZGWO7B0>>V+}FRG4e{Bq5M;S@H=<^R1-C^M!kTNJftMx0u40I0{J^U&FMc1sZbAT z@c>b|_+9KYxl3|6l8X`m$GZJ{4ZO=jB3U+MpA*D143oS8`L6B{0ld zyYoMvW)Ig-a3WfQfgLKOKmo@nOHV*N|HJ3bzxNrv>kd)0fV^76=d`B$*)_Ft%~{IP71SH>7Y2@wSK49!EUBXJfpj7D**|Lv-=S@^A0U<>T4wfvRS`hDFkaNChF0Hk64EMk!Tt<1G zolkWBm(w!#a3h2|;9w_CI&tiuYGt1NlV9%r-$}clT6UHyON$6q7($E0>AZq7$k6T=h8hv*o?&Mx4q4M+ z26f(EzLg7t&owB?gIcaJE#VsIS z%NAMnr?@I`K1SqYTNWmCs=#MsJioD7TuWioPvog0JP*kMke!>V=!COQ7@;LzB3IhJvb(CE$r%AoQ8ENp{-Fm|FloZy4tW5P)Q+TWL{)%CxE zQ5NK~*zQxB*-ftfSeqPkry1AAp4OS$?rCp7v$BxCssJ!60ZLf~O8iKWJ&7XO$|{iR z1i)6KssIWoQoST(q6w{`s4*Hy+Mv*$>plu(cLL;YCm?DYB->z&bsT4<(P_qOde@)& zxW=9@V^3?`&7JOhK3ba%q)?Loq*K>>{so;KdQm|tD=J+H=MHPPNhd4ttq>Xtwv!U5 zyic%onA>IC`hTXH?JR>1;3X+jj*{brSado$7p<#m=FcC@&de^j0RR^ZP+EYL4xkbN zp2X;T*afHu;u7T)Qgu1L-l^&DsVMEDaoM`5TSIT{Rr>Zs?$VX6G(hk-%4lSF6bFz{ z6PAWahKGl*8;yVZ50?@SsHs|@jB94GwSW%r@Ji*^~a$G?Pjm%WLoT&Jq~F&Pu5$$3w96^=IE|$0J>BFttJbKU$wOc3MX$ga%L) z>wiu423(>@0gem^87uELIGLOb{3}}aeN{-_R=dLgze|(qo?UHe5W9_2AaniO-j<33 zSOD~wW^!bA0(sI=;6GZH(hc<#rzMBX(4Ht7cTd%&`E#+AR}L}lI_{lQq%x8LH4G3M z5MYXZ-_%~J@$Ou0dM~In)ByCzsmw27L|V<>{5@&p-<(n_y|q&$U1|X+L1CrSL%{)n zhhPxjn*HpjDc7>$P^-C&k9;$gim-puTY>({wqJ#Nr$*uT?e}W;&VTR21`6zC-n1-a zmHQM1RX~?h_3iBJhISJcs*xqoZGafrGTr|`Tc(MHRu=qw61DHg zC5J6Hs#i54qP`nn&)bv048Y9ahKE=yl1h+NQk#msz1H| z|Kd~;jIX&?+lYvOOp2eg>2D8NvkdrkU+=iC`*~feF;a{)LWBron1nIq-eF9wcO`Aw zol6ZxL{!|!5EUHB&EJNpe(LOfz0~vh7$pkUh=Nt3Bbd_u5VUW(-hhBdo$nS|k$Da{oIr3p9;^r}DYi7;Dve(|_J|-}@dq?};xg`r2DR`m8^H@M- zvbd63y?MI0y0W^ly1n{r^;_0%o=vT9Zj4?e-gggYw-1J$lOf=A})q&&NX-3msa31D5G+@T>seJ^foJ% zP`>I_!`f)8drf1wk;WO{1WnlVn`@WGTI+1=m2=Mfz=z@aC+>5=B?Zx(E z=eWiJ6Cx-|U-K#71k0cfnjj!fCDqh2#yAr!VljVYctknloGWg4%sYH6+71(3Xwi|M zxgCcjlR_$)$Srl^Ri!mE%|Kwa^c1$ZLy;Vpo0Y}=t<>}ZNF<9qbLk)L93K#tvv)rT z2sjV`dar;0pnxux;MU)efq~uO?S&Gkh5IFxgXk?25fhjqivi}?;f60kSXux8;E;g` z0RV)J$z~mPv>Ops8G1dCXSw(1`l8IQ+J6E8Z{*k6HJGfO^)(>Il!Ssv1ErZlMbfW*39Enrj9OGmRz0!f9 
z{MsTOy>+?P86p`+P`mRvF2geh|I%Bh_}2Zwk9w|FH)AXsXW9jZi~My6)K>RA_2nrw5ktEL9_ z0D+DsHbiWgg4?U1yxx0{^%yD`5{?9M#+u@tgh3yhM0=d1;n9Q8WK;6K6seB_G+A3E zFQMk1>o0S=B$c!|P5oEtKdJ()>al$-=8ScqHX)I!s~__X103cc=l~ns_E0fU9M8Ij zpegKAC&FJ&;X30yo+LqQfv7is3c0wXl zoe$x%@FP~e5gH^Rk~Cb??Ie@)@QhzVmvk>$^ZUuu0v6@^OVW?3mshbrV$Yw0s^037 zb?KBGZj?ioJ_3jBt*QyYZ8U=~Y{k&KRa0J5Mbm$q=3}pJ1N}%(i|;#5gj{{2ZPq4+#@@C9G-HQ!#ozeA-TUZ@|_^mM;2vwgcs z@q7CJIMyH87(#ywydVgfmZSU2(FHLTf6T8?wLweX3c$jtmEJ9Rkym(?H#o~%9EHq7 zoCTBS;SJ~5fs7x5zSsh8C2H^%px?cGA<#g{>o z038MK;48DEf0m}uC*L1DO>GwTG$VU0_=siDFIQa8MyWGgLY4}=3>9=}qkSm#+Cly= zRW4FF=7tw^iTO2E$4|grRgaKpyHxG>C5$Io^sjR}Lml7qvKYnZ!Bp<`cIyf!$Rm*q zyc8)@bGCOm1|R8&H)27+?e}F~PRK63LbY6bjnCAZftupNhzcmhv1r}5oOO^kaZ1TB zU`UqyB~o3mJXKSU)x*XK&X_`ol4Pa$@#_|GXlfj$qi9D7MI|Dz7|*Afw%CyaSqS~8 zl~6_O95kl}&SPr=|rK70v1OMMk>G7;;!NJZ6=vMzo4zq4$yK zShejNOm_EJ3BG@iMG$zAkm!WrFmPLbOk<|4oJRYIc5DK`TG639gan;^jW&?P+ZF{~ z7pPsJ!=5yCWOCx3!AGi%VyRP=$JrSzITezm3?wp_9tlgU^WO2H&s>?*uSacj5Yl>h zXhIhdA(F2ddD&Ly9QrwCPGG==x#fPpn+yu$VHy*#`u8MFClUg-NiFC#nz>?64 zg$*M4CZcSnnw)TDff)})N%}bvW5fP@fS~8Quqk;_wUV#0X>sDC=Xy_C[@+FJuP zy;SueCm|brh+z{!6TvIdbqb5!rlM+ObVNz|?XtE)9P0)lL zqF}_PCu@oeA(GjF_nr*iZOR#*)-Bm(+Km{MGk&~by@_ZfI7o*gB8Zq=kv4k? 
zQT4KR0^&KN$YQa|28%vGZ6#HYpE{O_&piw_*BpR~8TT7y43zJND&+CI71Dx}wdZVR zZ|$JH;NgHBAmUw{<1z!*!QLb+-$0 z2_=j7J>#~0Y|xyitCtnbU(l|K(Qo?860AR#XwAG`p>Ncwa!b>|*>yA1U+8x;w5X#U zUZ^hPRv8!5o3`KmmNE8=>pHjCB4=ToUQu-ldgP_QH@fA31YI(B?`)}i)~Fvkr1~np z#+Nm%NlvjLhuE*}T1I3OrgD`qE;<)Xz{CJWsR5vx;s)T~PFLC5y3+6~R^=1_=YK@< zN(|qF1~RVzq@7Km*%EQSAKPp&G5l=Ux)qJ3_R;12WSu~%G?OxAMir|b zk!pI9ho+kwuo<1uwI2Y6rJ@hXtCq`$_kHBNp3n9r27pmyMmDmBy$+3fVNlFZEGIde z6S$0EKH_y6@c6dxYW2uO;&spORB)e?2T$yRsxxC@YDbskO(U*zGjj%Vt^itM_xPHox{$}KXzu-$b=dKX=LBS&v_GNoO zM)H~5MGL)Zh3vtt-rv{Lxj@}_6YNQJW!0JZ8Iy^2{z2lk*SpG>n&hsB=#Im2`zBBr zg3(fVkvO+ntn2CrtnHyS%H%+bd?zeOt7;k68vj4$Vyr=#co*`TO<8ln6;exN37zm{ zXxMP1YT}qP17zuUe(tlW?>+CDlg-d0SF0Z|Y(*gQ?LrQKaYO?D}}N5bn1mNjm8j#k2OKS;R1C-{1L6 zk(ha~0x)H7c|PZGu6AA~9dwv81~uh@j%28YJ>36kwD+u-NIW0QNNII!oMnL)i;liC zwvtCWAg@_g0EVSD6)}D+jpC@w1&7y0Uh1lZ{gzus%FskO>+*ON(73k_CtAnh6{8ns>$KAta)Cl13q;1WpnDq3K3RQKy(3ZA4wyVmQ zh2@PxR++SDBYmH~8Ck7m;+l<>VeRwIC_0hxES$<{uAEeqp)g3_ zYo<48stpoh-QXnKDLSj-pnC;?S6GpeSb`SImM*)D*LsOi>hD>yQ?h3Wym9tqoOO_j zx)MyrZnjTB4D3i>0< zsOo!TAC;5Otd3nAa<5~!EO~Kff)-B4O5VG)wsCpAEGf;f&^$hSP3t_cVsXnF%f@n5 zEiWit{)Gc(nnoL~qK=uI&}))04Mv6`Cj&;eQTJTGOxLXE)WFywMZ=G9vqs6>(&(U; zs9UGPM)hi9S@gi&uGf2|&TFr;WR8WZ=9Uvx9kW&kVqA8^VtQdK(shubK-pz4ltC2& z^r_Kpm&MLA)L5M6EH$*5MxT!_rc|9#ExX0YQc+wbiXItA)b;e#;oef`KVKV{B&&f; zo(O4v{W9wS!tbD{otDKU{T;p#@PlJ`_f)9Fz5o;kN9wjV!LH0Hl{`2UZFEFWodck} z`2ee>K$ad|^6)7fkl0`yXCf(({0;)7dIyJvK?IpX*plB;s1WPux`AWn2hrgxXonzc z>zJHimaLK228@|lp}F+I12DOn4FaOeT2u^P8x4Wd>{O|NkAwCK5HBLufms~V7k2;% zQ2Q(==D*!wSJ(h3@=#4EV-r!-g+M%rd=qlY>k2t&8;9}+fVeteG|5`6211n#PYzsC zKS-4D?T4d5PT}r`V#}f&*c47Uij-7;K$AA6RkkD2|5TI2n}A`pgCm@kO=1|>!ZhVU zP>`*?f&oxL*G9s=%@+c05M)|2IGQvnuWWD}5ZU?xz*AWxwBrD5wTrt@9jXk}GB~!y zdjo*4KbPnQ;6d;QXpsEnFE7~qbN~Jcu5a~vz|MEIuh9FSMEVO3e?Yx}_lsZP;NIY` z_xgbFZ`J)k;a{HN41CRPDFX?ey5@^Trw3EQBSblydj0XrY!yiR`4+9y;ja=c|K{nl(U(C=e79 z77>6Jr^tgtApr@QpeRa}t5T~`i&h=F^yo8Ct0ANBn4?lkT1Em}o-(f_F7;V+Xy}3; zg0v;l$fJrj##myFBd&Nln&Sy1lxUz()%;SdL;;1;$|)}HQt(ib0uyFMN~O}uYptr< 
zPr_Ks#*VrgsHZ+B)aczjr7AsAtwsnzTRqi$@lwxCe!q~gY6qU!T!br>RC1}3h%(9| zs~pP6C69axD5U6hzee?`N)wvZqE^bX{riUUdZ~RKsj4%xAp{ayYPnU`3IagvKl96S z(_T3;a@4tGojYNwXXltC(SS^l6lx{7D|-bsM?f2NKGnu9cgm|=kx*4W{QGp@Mfi8m`)Nq|7X zLZFz!abUP&@ChVHl1xM_lSHn3g^I0GuF7h48Z_IW#b#}`+F_^Ny7k)UfPRM^HSCyi z$DK6kw6o4b06<3U29F0)DoLzu++0HFj`3CbdgJG-8wGJCsY}zjA&?dB@M_3W?oY1kuvW}Wjw2oqJ z0vae8{04%AOKyZ1aYjYi{0d@OqrH{udq|ZUKKY5;$KRc!(_w`SYq+)wApqC zJo4u-5A$b3zqk;$MnW$dF~0B4%y%)Q0_%&c3Px{+2LG4k7gy1WZa+poFdr@txOdFd zWsmY>zW7du{Ft8JF@x_dGt^br<#vN@R^_WI==t`$NP~T|dik!-)>*dat0|m$W|>K! ztA7SNtm?48!^+M->u~F)Xm|YkaTFRnu&D-IdlM{^J&x-gut%hd_a(dH68F&(c+wTI=hQZmpY zB6uJ-W+H?jUe=(2U}_C+WdPw@mfZ3Ns&e62Hg`~&ktTp!X2qC=LUyNJ7E*&!!KWm* zCB&KyIId4^28kwv3kbtTK`0uWlNrdMnJfqK-SOeKH}H6FX~$`=Ve`~{hvIAE-g8antYh(nwm1Lr!~3si4MkIaT!Rax9QECXbH zkP&0owc4g&@AJ~-%Clv9axQFf(Uf1ohWi?_G+8YNcfM++mAO1P-zR~w`}3FSi|-A< zIHoptdr%Fmd@qiw%R8iEktvmsAHkO3Et^)c$Xk#kCWA1$%R0z#<(uA8-w62 zZ-3b9v1^fG&`@=*?bv|3zzdHVH{rMwPQrsQ8hXG6-nuGzICCtez*Q(Won$1j^j5z_ z4m!NPQ9E`v599Zcx%sjIOtP{J>pSMcyd5!+KJ)8n3JCOl2@Genm3)F zz(__u>d}vRtkXU2>37k)KJXDh*uP;ke+LmJQW@&*;pye=hT%lB{b6n34!YEDv zM3|%*y+59>_vg#F$cwV7n>N(iP^(~zt+p8>5J_YTl}0y7XQw1{vNEf{b+f4KmO&YJ z_oPw7+6ecy;qGB0jv3vcO!Fzz^6_K?9(B-R*Ls^^?=IH_{|OCgOjDY(F6*-)8?z}b zQ~e9lnJ)>J2(N^%Xq9bCwN-m{)YjTo+iOR4*3P+|*dxa0KZAh8!!IHZyYZQRfUuS3 za%Cu_?kp@UUYC5#UxJ7#-z)HyHaL*K>Z}FX z2})H&t*SNnC{pWVyOm*00&K8&-(>y{S(g{{wf;F~c~~yi zNCDrteDKP*a>PI=C7ZoHM;JWZ$#P4$+=ta)(8nI@}x->S0$$G zb8qUXN+47&qB=1Qq3R*l)~vls43(43^& zYj=JVnb$0CmUgnx)U`&XyL%nf{d%-8xUy{%#L*Z)@Eqbf>G zP-H{%xq5xv*X`Zii-C4`+&wNL$!%#%pE@@POGayS3SUU!@Ih@|A<7>L`zjBe zKV!?3WqP)qb-iQgGsD=O;O@Sr{@6}G8{zxfkFjs0JAV%Gt!M8t(4MvJj7Dez_}Gb- z`bHDQwod}c2|Fs9WMr>p)%LaU zDz5)S`933f<4qDzF-M-A{!q)rVTClTl81Hb0MQ1RJ|K(%WezxdAUFff9~i;F3I|R! 
z@Zw>Qh2f~xp%&z$@9Cu9dEC@umJ_{>if@V>b*x*@QH7;b9RErOtrPUL{HaTVCEw?m)Mgd=Z0q2^KCfO!s zM@`KKqa_)mtxXR){(tS$jSul-0)a^eRn#!X8hadY#Ft<+A(ExZw!ubQbnCOoWB;lB}mb2AHJKGz%@W+*+G$xn=h4wpLj8$KrG2{=a_r84=;9 z0#{X2MSYSk5FZeR4z3W5n3bU5u74@o)<+9~6spzyd|UHm=7acar(Te)Ts56n`_zf<{D|3tTnS)eKQN~x@`PhR|r1D=+{{jD+ep%hwGM3 zaO+1m@3u=nOR@I;ber%mwtallNhp>+sCSsV-=ZSA@@;`T%v zrPTK{%IL8~dEP-kG6^oXuJv)>{txyG3G&z_(bKq%Gc}}dFC|=S^uq5sv_v_!KCp-I z39hGIoja00l;MEhiBC3h2ypamj0Mpv^`JOa_7Qr{k zu~rS~M%$|3H*xiQ53#5%LLnB18!F4O3I-SSl#$NTjcA}s=r+s7deVmFtiy)Y4h^C65P?`J6m%~Un{D2 z2xq^)4j=~(ga2;~0p<1R>yLtozn*%KH2 z931u!;ffebi<{3nYo9d`i2+BQD|*!t50;6_IZ02v|(2y!4fxd?aC zLE=K>TZYn?_@@@|X-Mdv`G5Vc7m-yQSx%M1X=tmnp86Ro(<(Q)**y+=+4@tr>5AX- z)p8EqJ>W3Mx@a(yiGx2HG;i5d&6Zo^X1Du^W~Z;hZZUPW2fo zFQUj#s4)$JVF^3P(cusR5vvFgAqPMm?O?jC>2-=urg^O8$M;v$3vu^nH9yYO@?mhm(Eb9>f87I8yHjuUs zy1Co{QQ0}azuTi;@=0{BmS(m9KM-1$J5K%`5T>+w`*ctWGb8h}r8{~@_w+!I^@)i) z+Y9}o-;%(?w|7qsLQF^rA|#2c4+}JgaU=P#>1R#j*B`9G8mT7_UJgA{1^CAOhxLQ? zYKI;3FVDEHWMJRxp!>tE%<9CYkIH|z{ogUrwCCdiWCI6=hfKO?{m*mEL#Z(UeOaPE z)m;5bI_mFzknmptYwRi4jL*(He)KW=SZCq#LiB?FyqM*}=)%Clp@sbm`xd(A6U%9f z_H>9ueKo%@e}Deo{J!}^^TqSo^Qi#F1oM%5@5_Afyv4jM0Dw`-!v=s6`v->iHv`<& zgVk#9yq){~#?9;30dRjP#p`gj08MqEr11%Lx)j#O&kXU{$t9oiVXr|hmAGm7p^HXc z_Y(LgLeT$AGs`mmf`yB!xCA}&am^9H?YIL_^G08H*wYhjca+r~f1>K9T@KB^PW|}y z`;x<)&&6EM)l_t$$LG7*o+G3W=;1Y7*oGS2X4_r7`MYoRzk(Z7AXc#f`3K6RHgm5k(%tZOOIZM=2J%Vxt@(TozD%gy!KHr0V0p6sVP}= zt+dl#S5X-{U`UPuz6ROAB6D^e!BH6T#A`t&X~UslXMw>29b#m1>B|#+zu8tE5|oCePrx zMT_V3dBceJ^n9S@EfYR5@mZk+a1g*%8L;HLa+SbK1@Kh`d{l;jH1JOyM#i8SLeB_V zT7+i?Crhv>aI+yQYofD-p96xNh|Q7sT*!|`UcnRVpCo?RdHBd6g6Q~7fD00 z)W>ISJUYu^TUo3tfgMTMS|;1ermF(>RZ5@3)i6*kM_Ns0D`_nidG7L4mE|fzL>AP> zr7=MkWpS?VlFZ9hS&^k#n2cm5JF$#PPSO)iZbl|6Ba)wjj4R%yo)VAnB{n<^iT%mO zO&d3F_{-JV@Q%VKy+j~2FjxYSM5YQ*4B!MqQ{e!M@=!jiRZpyel#G&!nud;(3k>1r zfg(6j8a1&}h%g}4T9rQcNHx52XgPQ-KwIEwD-qg8jINH`_Tppv7NcuO(RMPlgIt0_ zk|dN~(rGKqNYo-pPO{6t+K?h|PF6^QJr%LLe7Y+{jmI>2N|F1Nc}RsvNX%2>0Rp!X 
znI+2|a?Fw8KaL{cEFRn>0)-#4`NhU>c9vxEQ$q%Xq=z_PGV(*12Pt`x+HypBlbk16 z`6G!+PJ!eWM0NpeOh`*9Y)(X5V%kfmqYSo`#(^r>UpWV>q`#`e!_oSe3iieJ?+5Lt zN_7jSyv$j$Y|*s0d4nRAl1!u$I`-wLWKlg4$vlw4LkX;y$UO>1xygcZ?sA=Z(=4-y zFrBL`Ajo8$h1`;4Ob2mBYt83|E=DXRM#ek~cerMT2z#v-cJ`K-XQn7yF$kP!Ys}@g zgdF}hG&AECnzo1!j=Ji?O zD2F)4VUBZzVGc0Fp{F_88V~?f!GWNOJPM|s1SvhpW%ZKLBt>JAo7#_63xFAUygG4Y z@f{OgmU=xhS^e~{kf&-G$L*Z&YaGGj@(fMl_!&Fq)r@zacpaI4cClC#VNK(8`QCq# z5y0)50PP<@e|Z4ZRtHw^)EN}Fr49Rk!In0C*X>MKtMq)^Rs@02*hcH?&@*HhszEc_ zh;WDg94*O#+yC8muIO!aH&I;@^u2^ZKEqG5ul5F)mzlfM7ax&f3Mb9x@ssK1`}EB%le?Te{Ut?}N6zAy8GN zfGkO1;z?qT7%^5_C7vN$6owa2Q{6l2P63=>qeDokR=Dq*i?D#ZP^z9Q`UYw~ z4aWvwGe*2iX8!MzI>d)0h{N9!HBRAZcNAT9Azzn`k;d43DpJ%-p|Lt78xD>@QnF#& z!D7@cFnt zBloZ7KWF+4tG)*Dn1?hPc(3j8j0OUJtE!N4kIuj$FFUkoRlop&2kzp?fZHAl&UD`O z=l&3B^q5&a=$bvW(CpshmN^9sWLlbA(4gSVGuKeJDI@CM`U1HvS!%SR1djhzVTMIn znZct(T!1Xdz1Lv}PG(^mH(LpH>z++~MSn?hK=RA#tAddg<(m{RoI=T`Nw8#Lk8o`G zU^~$1YWrKc(o^s+!A5jFd$GuFQ_L$XQaa-Tb9lOQc3z9Y5+NW)A zy4u_hC~Pej4Ty+rOOz|-ZAA@c1SddG8VMaDj(`}$8Qs(6qZuN2vY)A4?ZHGY?HL>L zi23dyfo!gm-!mHYO#R$3&| z7f0a7gyG@|x_Hz^ZkD`D(ER(5|2Q}OS+@53%ro1TG;*3Vvfmp*U(SUfg_vM#jvUP@6HaNn{x4+tX?w&F2NX91Ea zju=fFZx#LF#8c(D!XGq8c{@IS6^sh??RRKx>w;t}Ks)&I%Vw!yAhKBW zFiwfmffz<13w{!lkTXz*EmFNY3MN{nX18~bj+(xZCb6%$g#n)Iv$Qv*G6UntF}OcT zn!VYq?=N?Vl;j!*_M*`sC`g=U#m$Tn84$oLFL;5gu)a9TM7tMF=B!bx`jO<{v=rmd zq-i*biR5#P{3Ql2u-zW%@msyaNPLCgA@a^HY31E&U0iMLqg!bc+J%mzg|w4(c2`Cm z9?Rky-Mvw*!fZ8@&09{bdxVDhD*X4Azn#;aL(A^AD_b9j458`cCdHJ%ixretKq}MRNdP^ElXV4KRoxmRq z7y>HTt6xGMb=FWG^|~$c3&*Yq0Ry1|&mqM(O(E6kTFs=IG&~AMsWlBGZ>|2z98&ji zs8`}Yu3+8{wjX&|x9y-XZMdp)Q~a!bZ8kp<@@yH>Wv zMIS|2Lw-ywZOmkGM$6AX<$^DTBR)#FNYw8vGZkn!6YzmV*C8@(l9`$Ys8DvCY-7bQM+-lGO%&4?tuhK|T(FoA=PA2CcG@qX-uUfohF zuxF@$bpGVptS2|s?acLdxgh5vIp<`Wk+XoEO)$v~Ub#s1LDIB#MBC`huahS@;=&UP zl4S0-{s9fekseTbarhOP29lP$Rm~K%;p)peZVzW z%uPvBe3yXDtTUp{Z84-fF0eJOV!%G0tR}QvBAgI8cG?^=JkA3+DT^pusH8(u7fgPH z&*!$lG@H|7B!Y7ql%9A&XA}%ey9UvpJJ^8PnIZBt$uB35&bEeYG8zOW{IBftgyS|i 
z1r#ufxC+?JrX1ucOY?YboSUg>o7;4$F*(y_DLyZx*LH~oq$3~byS+68N$FH;D@P{d zgB%Skitiy7B^*gHneXv;@Xeu>fxCX$dos{haE>i`HU(<%?)9QX&w4^o{RhdcEKu`X zC6wJzroP)q;~}DI5?&VEduGFhlo_4VG!G1Fe(8nffS1;Sa0Zd9t=QGbqe0t*TLa&P zk$;;==T6F*3&$rrAP{xmRZ(t|t>3rLl56u`!l1@0ab!O!b%0T|AOK%>dDmwCaU2a& zB=hUiFXX%Moqyu?1D{L&+5|Sxs=TokAF?eQ9hSeCgfIt|lj7PY|6g%z3d@>{JoD6LvXRtorLz>HGQ< z>I{`vuY7?~U%_|nw)=Bj+0$pk3Mjabff4m2?cK@Wv%+1bvUnQydJ*sbJz#T9y1bc? zNB&F@SIf(i761r*f_VzAky<0BV6HL5qN;YRuV#e&Y`}eGnshKq+%B(IKfyM6VyS9Q z?v_f(hd^*3&<5F`9-As#tCSw&7oneSfSsSg3ax3l8ctd?Q2?`1b>}Y=3sYL1|u#8YdLzW87jt2_>H#8>PVybZbJd zGPah9xr6W~*;Lsoc!wx_SIZyjbh<5VFqM_1R%qq6%Z*ag0(CWndN#P*Z8vb$`GW4C zd_E4ZABOkIU`Ih!ayh7`78)t}U+zxlX|J&{1Rjk=)*jaiuJ~+5H*`J;Q*Xc8tx8m=1^)lorTb3F?D& zW2Pmnl}&QlBaXj&h$-I7(DoZzpwhCj1W?1DaTUEJG^bDBvEaR`d%~7}MvwuTJ|FSV zO6_2wbvc*{rk&a-5L)Zwym1bbX3BNt4*(Z{$t^u3rnE>YDx1LDw*_ohfqJQg6eEcO zfx-Q89n6b1yboO*4Ayeq@1F*Bflb18CH|v67KwFz#1XVMD&Zg1uu*UZ+vnc;gg3`f zF1v>d5O(!w8f<~mm13x0;Y{<^CW_{4&a>=Q>uH@c%YV8=d5k#Zu2;IKy5*3Z~)9ZX9aU@i>~|{~Z2)UQjXaIFm7f&; zxbCOc|JX2){gIPvi?nV*!)r4MR=Uy7d6v^U=K~qF%P<2!8dCJOY9}?w%4^a#qre7m zTWMlChwSDRq47U4Y_wxfwT?BMs}b0+Sa#q1*)5nj@m^?EmwfVOKKsQ*%3XiC{cLY| zj`OSCqPnHGdgP?Uuk{s($oqEzmyG*kU@wK&c}UQ>hgmR(ZwkW|;&avx`!7Y1qV`Pn zH}4(0kNy(0c)Z=3%~mSdI+$C8n}OQj`2z(nWX z01iD;ip4&?K7P+(8?*Q&piQNBY{4jJQ%Mdz8}GE6l)+?59Pd7x+&Xml!gqbnD3Lm| ztHhQ^#ibNnF&?VB1=FLp%~{AnXHC5>?!AovoXC0b{x6r|pZIG~pHiO{o`KW22D1?l zH}t|eg|ZY_Yw9R{$H_kTaJ)Q5DIJoAP&Kn~<8hBZf%E+m$9C%3km#AZXiB(nY$M*3 z-I-}EuJWBH;5Z(GlR@BE@6pV$HJk=V3LOSza9ZuP&3FrnH-q||C*#)JfOOUY_7v&4 zn{^yr1cqPK$IHBS9QtYb05L$$zXpu#D5G`!pFN4EERC7KpLU2s-U!3l5Hrasb@~7} zs@D(7<9Gs&;;|<7)JtR8{CT@$qo{8RBkO#zV@ICj%#X1jmr!61lj1-oJ+!es>9PcB z+->``rr(;@mtZ{a{*-y!wlssf)USz=NL^90{c(!UfL`oopDOs>%p+-#+U2isw#RfQ zZ9V!k&-|YWu*_H%e&;bLRl?t#kDg;*SVgVXwyV!w-NRYhWwaA>cduKEZw;vJt+yR3 zS)_KCO?CJ2=Ax~Ii)q29nR4}2tXQzPyujlRCzeq>DE}>0HVx{}3e+#NXos<* z4*a@IbSdw)UXiVf#}jc^W7#%DFRR7=`9i1puU9nETD|KrbeWCZ1k)T1RLTa;G-PYZ zjA6=(&C3o}AbnQGvW~saSh85kdv8CtM=i~W+81V>z1t=-_{vkeGft5952uyi;gFg3 
zZR>y__a{$LNWmsG6rw^f(%p;iLJMZI-IHbS0pta{g;689GdD7fym6t=4Mw=5x+RNz znBz}dXxq03nkgk&A`pT5g;3!4;Q7~*}_BEq+1tGyx?D8NaWVm z12p%bao6rS(LyMmP2T5TBU!`(9!!J~TF#m7?Wp1TtI#M|1{jBX%G8?Lfy^o`xl+#4 z$Ka#KZN;}Y#9j<$7rKYTw}l=U`=mUK{N74{7VQ;N zRTWaKw#|8cLdoHu_svR`uGT**P2ph~bf2!Bz|-DxFL+`q-2|QpoWtUKTA@btmSjOS z*vX4=@{rsopnlY1HcHZ`u@_ZAzdqxHQjTTCOA775uLN7XeaHsse#uyx63nB0H@^*_ zuxGMHAA-j`G0^?jOW4s>Px|N!Oi1$Zi3pB)2!$Tm{x4Xp>rtP2j9D`lRT67hg>+EL z$sFVD;jK%!Oa43NCRDclE&jQKl|TUeb9P^bmBBcC@9y8v<2k|yb6h%0?*PtpPLwYV z?+T4MDKy>_;0Eve_5YI-lfyJJ0M` zz?1b|z6?79aBSP9~_DTubix$KeNX!J+dFAXoh#1@-JGs$A`)sv8|!D!{2+PG$SV_!dx0*G-6BP5fvKjMhBIBye_-{jPK&DqXfGuYC-C_a z@u_v>H4r#wm^Hv&Pc~Maxl%9;Q8lT5EMvPpV~y8&M`Dd+cm!#~VWP;gC!DC^bj*cY zHlMHM!o67-IY1m6+oOAZPy=JVF+xi#@M?#+=scRNM#`!b>`8;K1lKms&nq~YH`g7q8O3(q<+khpvlXwV`O$BZFWH7eb$bo3 z+ek=Zg#$BetX|d_+U+(-#u&laRf0Xj2T{ZTNV213lnMODa9Pu zp)5@Zb8a7N8H;n)#dJXLzp28TBN#0?d8*GFMlRut1}LAOVq>>G?dEhpKZYr6P=Bh> z;rV5ehcL)WXNrn@vpO{yQbKdRX|*9_%}w;L*&Df@8H(6lAs@3JD`s$)&EAB}2+)%f zMH%;Wet{wv@e}!NWm?4+!Lo2H;&g>E<)k21IWhjs@A4)ROL%^naQ%)t;7bn(8ae0< z`T5MD#t1=kJkqyzXG24NqnHV@(m{;>HH*W3E5v|>lvtJZ-JIe05W{A5)a&#_V#|ou zc@+V&yI^ytHYNPiCHdHqVqPR}3p>>VOe>=qV4WaEI>BjPwTz2#5U?hN3S;=;vI6gP z5kHQ0x$vGP7*DVaW#c|a1XE24V$~BY=CTB>>U`Z%&W~QM^nWGSox;O9U_;_7vgCAL zK6ylu`9L>}_AEhpf~5hBc)%T}bX%jjx=F3h?T8rFIpY7ECG#zfJ%xK|x)g9rxxBm9 z5AnEyxY2RIVRsn~VH2Bh5R#}g1Rg43BInJC3X3yYYd-NFcec247c|Tlx(*L~KPBZ# z-M;lcVzCcWq}<(yHeetKUuz*Z!kw*M7eZO_bF4ON7041`AZ-LeG^~o5(iCsxql6!g;`~OiwhcQQ!Nzfv^EW>+4zhC~BwrcU zF>K-9ZK0|)!}lZ{3(=)dAt#nnF0?|sK7%G3IaSqmjlhnAqcN(luACsY+@^EQ5I4KZEk0_ zAEN1A*xfzP&l6-VFjQ{BB3?_=3t?nTn5^tUbj^sTm^!Ow9=Kw91;#0ZZXQ=&kjaC-T=n# z2RDV;9d8-vIsEryc&Z3RrFy@X;F#pHyUc>zsLS!%WDIc$nw!hE^{?&zNd5OR3Bz5i zWcqb3aalVR70YRRKqkd>hMSFfSDRR`VYYRFpAQ70$K$rtr zDRvF1Tm9OjIVaKaZ@tC6plVptXldNSYz;#hTBNFHK0`sJ9+|VDwb?t!V1#f1s&K$B z#N>CSdfIqC@E%#~54AoPJ&SnJBnSBBKd;%jZ8syVUhUy*>Z~)tdl*;PtwtpZ%RypwS3beMR8fD$-ffixM=X>6n5x(3Fxc3Qk`xCtbAr6avQ-IWbpO)x@-M z+=Fh}hNMFI2L0jm71j4q7_Ky?97}S_Ei?s1tG&Xsht(->b(K$;@_1I|d+hVg1e8Kc 
zB}y7Z+Yhji(a{wGpC{TCF+O25Xb6j{soIjQlXC5Mf?2(-t(ccIU|LxUMWrcR+jBMn zS9;TxKd*6Zr2Rii;Snuf_4B(S{H8meR7ZUlvXKCDI_$VhoVA+OHxG9L*>+6sk<|!I zUtLtVH|NveieC0Q0_96`=8E07QLt8O5+H6RrWxUED?ygis3;ed&16W z^feT0ie$3U$(d7v)ij3>@i)mTTW7X2l#>*X)}Rk$)F_rVSi)OUf)<6KIlM(ud*kMB z&5JS=L}#>Pc`0Cj#cl0vRpYum~k6$wo7Q?!z&P)1i z#uI+NfgpMTY-WZ6%wPOpa-sAfdTScJjYfH$(?Mb)pY_L||zy;jAklQ9D`FRxh05^U9=LyO8?SevZ=;Rj@1P3a`u}3_}%!zIE;}fJUA0Q)B z6ktC`dH@kNPd7}bjN{KJcWl0;LZP7@T~T+F(4_up*&HDCWQW zy-YLH#_3b6U_!Mg8kNl!yR{S}LO#pQ1oErCN>*IaSrD$f)wdFENksJ z*xqoS`)=I7!j`*|9juLB%Zm0zMh4|GpY+HN|G@32>xg*y%&5h-*Ea*4+AZu5CSEVx zD-56IO5Y_08neXA7dbGJ*c^*Hqsj=ZHnQFx;JrRC?eUKD<37rW+rw0Z5BmEvg5>h- zpuL$X+L0U#Nk~S0zj{C2P8NWiw$?!hph>yw7Zi4veWVSK{UBW-v~&#ELZIWEnI|y9 zs)YOASTENPC-dgN=<mEJs$NZmHI$1yRk8JA^shU41 z31t;Y*MAZTq2XAT|2B*Zb+vbdI=T6?v9qH++!a0k40{IVQf|IxZE{xrijE-c+C}&) z-AMjNvd-#j(L=4&{3I?E;Dl9#sLkw6&H|+wZPU+`K^->1rY`05S>O86)UVS%#vTK< zJu_Sso?}J2nIO_NZ0VT|IL&ZW+Nk(;-6h+KCB2nXSF?4U=+1P?jf`s6? z#rV+04S{k@ML71cyjWl2h>ul%VtF*v?!w(s8bFgb+`Q57x=ck}q({q8N-qLNOAp^y z{=!{xyWEdo^~6Y41LY$DD4#VT^>MNB_kis^O=B^Tc=8OtD`fFzz3}Yf%oCj{N3Kzr zExRso6P~0SCR%dJ$taF^WLdaHu(CK+7N6LVbx#-M%NM629=NiN>nN676{@GP#dW5* zr!KkD{gkay$6_z_V;(l2hpnWf#5>QPOB_lP7VbKCHev7zX_hP1_vRnLiZ4jt(ykaR zuH5pLS>#j}2a~V3Z-G>AU3iWbX=NCsFszS$E`4;;A$rk_Nmuqd!a5emO_DJ;$RjP` z3}S;Ws&IfwBm*I*l4vmV$-B8Lp`#?KuDpBw*v5?;w!FL=`xk@p+g#PL9(eBN_*nSZ zvqw119GFkHeVbwXrXBO?J!%RrY40=IYt)|aUGm@Z=rNxD+zPAY^32CWunYMqS~d}Z~!mA!N* zY41?QdVa`hAGYkv@*JG7or|sQ11%Ljt$zFZ^s!QJ(SG#L@-jeAoUp^DYOWYkvG=^Q zeF!0%_R9M;3G=(>^yXaPPgxM^&CRkGP?r2+p;KH_dUb2E-jrWNzcI3uFTKQw!#rC2 z%oA;RM~m%@kIWAmYKV51bvPEZ4fps$Q9Ke&ry@MALDjOb4Uy2O1n||*@GTL-LSybl zR8-JIw@{he!;U)Mpjg)0rqRvri)Nmt;uyA1IU@K>D%%IBz<`T99} z&4YbZ@%q3}LzC_te{}P*hvH?_`=brn8zu}UFvB5hbsZ3^fexE-Ir%r9W6 z2(=~yICcI+l<_@$b^al)@jb9xM(+Rl1IS!ug$Z>`2u4s{QPQNnOrp*|cikeg|7U*? 
zs}J7s&|p(#sq@pDI?Dd#4KpEUEMPxUiR^x1*;E_dZ1frYW>M z3MhT7>vj_^W^nn31_@7IjYM{|+aKm(r<+kTw$rX{fgL`ayaC*Q+~F{9ysJwXkQm!` z*K23YYsOHUeg6W?yxPMI637^DhYP_bML3BY#`%O{EC^sYfg8sXym4HGU!kbZ3teBJ zyV2g=F7AD~oa7;k6%?Oo;I_%eDz-SY-JI}h*j))z75i{B*d zI|$!Z@8EAELlsQ#18`SLr)kYrpw4U2(|jr_h#z-FjeqXL?gxZ4mj3pAa$v#kY>z5d?j===={}&M#FtnohB~IH8M?yY^)EVrWOfP ziEGAe)S(E6wi-~9TKRXhl+~SUBC9jVbbf&^mmgt=vuqXQLzKgd#371QOh)q*MIJH- zVu=?`@y{qgoyrL`QSo&+g_`xiKEnW_8#ay<`yC4&8Zx)Ia59gcjFbm>f`w@f@RE*RvfE%!AXtp!mI#vzy3<$!Evz$xx*Y;m;$y)q7fW8tdqZdgHHj3N-5Dk$ z7-P9TWQ-3%(jcZVYQic(&f#Dgrfm@X>#koJ2laTJ#X!I_H0S~LBrZ1yA4rXTg%sL=*{lc! zG~kTH_k+AkEdCVsh(!&xU9n=DVGkd)Br4B{wvv&SuF&#|$5966bfnrKpRXOVr<@oa zeoce1Jlho^Tf$cxsW_GKzpYs+37+Bu=ohk`$tE*UDdtg6B zq|e}xtO}{AxSC_*zJJT5x;v@g&(dx6WqTb>mfGLt65}5}7)4snRxE00{8(pMbzQZq zS{tjcYm=+=m1hi(F!obDE={NqxGLyZ$cyNDTjXZ)1aUDW$s+N~=&0hEkjaKpQMEPg7*UvY08?eg^^ z+~O`zI@;oQ$&V3aoz0c1QJMTYt9d$4i>JE$gT*6`{&rm%{9w4tK+PKXr%w87q>wP! zCOzVC@T<&t-s|z(jIs;2uGnhk&cLKge!ZdLl@^ve4Gs5O95-Cl*Z$7X@H=h&9}GL` zZExCI-CNlfv^DD*-))V59n~9rvhshzEUzQzqD_^Ztl*?fO;lo)ln4!zd7vl;$2&;d zt|XytCv-;xOG+vJf-i9aWn0o7aTnosx^FysE9$9{)t~Z9{v&%wFcOUg-nZ(sFdtP_ zHKA2y*MTibbKO_E<^pK*N7*HhJJtNBWn*uC&=OmE{846kqOCWTP(~rYib8by07V#r z+VCGnWWaBZfG=8J&x0IwsQ@5d-Afeb?XZt(^YWH)Uz^y~8hS{T48qT(DZ&rq9>`?B zZ|%bany0+M)|-9A;bWyIh7gd~XK@r&ZhfyN`z1Gs?;Jm<{ znw@bS|KdIA;cd^}u|={8th`;6)qf_X5KDJhkQYkpWOuF5`E1U+TPb;RgRUPEa{9s~ z(0Kc!ZEfQ|c?CEBXT0)Cvw2?GA$S+CqGk;=5Pi^_plf&Zi$}=jmoczSdesUMrbEHw zJ?}^keFP0tF2#Pl)6(L~=eyqgQ1+0>T7$itHNhHUL3uQMmX&Z5u%=j1@~*<*9!=Ih zGO&CuTmSx*w58Ej@v-Xp}B^wUK`^D7ozm=s=GZ))6gsY`K1%_r`W;zig`LM_?3`6g<` z2z05)7*z3WoK_csn$3UWwOpKgNB+yiS#)7eYx{t8JaopDhZ9+*q@S8|%YjaW!h@`O z5U_IqQGKPWr|aENirdASV2xj<8Q_6Su=u)D7LkuxTy12zvAEwMx2AhbolZsAzQ=*P zyFv&LhC}xuRt?J))DCxI{q5gr;B#zR9sE*Ou^F7)m?-{w_xQ7Xc<^^~HL4aTPfQv>u zEcO<+=6Ld@qo*gddb zGZi{0eiszUGd&wFKvzeM-wLa;TVMDwoBcW(Xl2F1+FGIbatz4xCLcO%dYurc{);CR zcLhJ^J`euT=~%tDVvpw9gQJa>9S&8e9Yc97+w<`<%Vrnv;A{^>eOrZz`pD+ zMx7&oqpd$aH-G=cac}DJgBdv7>b0}SG4slt(F_L{!yVaA5OCyq7<$o9eWNgQ(8mtZ 
zg7tp9i!^=FGtGs8TVF&t-n40`#H#tOP3%YQQ(wA!&-%%4lu!4LDtddLZ%>=)q(FOi zdTX7tx?r8zSGL1HcZXB=`m3FgJ$K|f^joLDcv@o*Y9=|lJD`_-U}>47VT2?EViHTc z;)g0Pe3JBaD9-_ym#F;5K-a)K0Lt^mZj4=LTe@zG2L1vju+d!Ik2X$^%2H7r14*Fc zYk#tFew0Y1<2V4rXetA~;n&b=Uj8afO>H|e?Jj*OTMAV?ohE=2E_?#Z^_MW(!!wrW zLXnvp0|j9VXZnk%^~e2h-=F4YNEdLHUkAuMFDmcj%@XJ}t z{WjfZr+Q||H(v~ry&T%tL*vYeL@vP-=bcMH#AjBqzGSgXuk6A!@~Nqe3sX+P&z#E` zc0&kxPmP)`_xYYqjosSwzRa?+4nEZdOJlYHT&&Gbfocisy`%S7Q5-)NAYK=lyH_c< z2^;Lm7^FttxrfuXAadbC#`*KHfzN#CRuXfG72yYxeI<0km1|F$OD8Nh2LWxghV8O@ z{ke`+Wi8eoE3({9glp&a-LyJ*`UD0u4cc+J1=4f7P^&U-`HosmpJSc2BOz1 zE@XCO9qR)srWivM<~33mo1U4`8XLic`PD!q-eZjEAWs@T{l zdXl6^HoJ&~HJ`X+;>Fl2%(`)B54B#ym^w%_9w>bWet8t_l{w+Up1HTrEY?UkNbZ@U zisDISC=Tw z;aY6=N3(pcwAGyRv=eQCrcYrRxMuYW;5fBqjA~Adw7HMsb8ua?ZD<=4 zphQeRE*9(KfySRUMltXv&l^pxHHX~EN=kxDqLp@AbN$z*kuE$vVrJc`N)kCg#MaN4 z@iOdvET6Vl_HQP1xIcLOHa<`x>i+NUoz$LsGNtQBWz2}TeWKUTDF$}kG)5Jm@>Pew zkQB<8-l*MEA6FtIC4=;xf&xiRW{kYYZU~vxuoJMVnk1FT_QZD zk{aDC-@y4$bHeCvmtK~VqWe$W?W#XBXQEa0a;g$x3EiZpZCb(V?RUUCQcqDwi=mow zqeStXpdpe=HggFs;KH*}>&=&S)k2o|yoKh&Phz6(@QW#APt<}}mfUe4vnP!ZhB()x z-t$=|+aOatXsLN+6R5b@6DD)k>$POlCi~}7b{Vx9*j5r91{a4V1rxze)L?F3+2Gf+ zOL3LBRZ5}z_fj)!=4c^ua*sQ!bcDJSf-NCtS3}8Tn#LGi5vfRPvwGE)EnCVRv^Fd! 
zZ*Uj~q)kFliF5f$WmsLrr`k01^> znNE?|wkJ+Nb0V@83fnXVuUC^VuN|Zb$P5@ulx1yIW=jP1TUfK&H$;@p4~t(iqXOV;81SUGK_+9>8fNG%mw}pqdsq*<18INCwzq{=gWKiH7=_KuBp{R z7ESCM^d1WG$y4FEL#v~Y0-eUZz$*{NAL7HJb@B#w!Vz?vKLgOzS6J6-M+(> zgQC~3MIJwUR>F(J?(K=&Q--CrL4I!UxcIu(QXAJDy=Eu!YAjUVw2be3m#1MZ+-; zWl(y04b6b69ew`^2KA*3&y;TZ+36C_xq^S5Z7iLuW6=IQ_ILP<3IMMrnsihglhO@B zwKJ#z1Marvygy`s)Z1bv5`*Nm(qF{qi^CP19fnBVorPfFtZ{ev7F1NPVcKR%xUtio zE?C^5g2@sc>YV#Tkdsa^`NBfyqw!(1k7qY{az`{oHnL8!Cb!sQmD8pbi<05TU&cH^p3beA>f{XK|Xr;2PhWqW@j+w7AqDBKu5l{UoE4wTB6}h zYE854!0rfF?c~d-&!+FKDG6wD6US8*68|NU{P~*t2EIxfW*+x7t(^0;#?@aj+^+*W z+Z~CtbL~4${_%Mj9mA7Q$GV?v0^)PAPz;o_UlnC-(b@ktpTF-4HSH?dEEC(mgjIP| z1^lpkRWyO!!%AmVekLfzz-8_7BZ7zpPYF^sd>v8vHL+Bk5i=myegN|g+d>Y1u$gl6 zLN2XmY)1Lb|NafeZoC#_PWMPxeLbB9Bdv4($@Y0?AMYF1wy+p z=RKK9MM)hhl_$db#h=(xgDEcRO@nK)edF0gGM;g}-9pJkJo|`Arro}(>F5QH9h_qD zJ@uC*83a4Bed!|5Km(2bK`-i_mvU&&6pj1w^Xy^X0B7`iB9zGLh2x}molv;WOO793 zajD$`0r1Ph+2U?+K95`I9|cK^{{VRVE?uImoFNfaRNODmXRQUZZBSjsFcXviV<%3C z^iG(MTTd+D^ASs#h4^4K5L`q3pL;iFsCc1rhVo(y7rT}PrGrIlvYOWaXY{ye2G|g{ z0=osYw4JU#ULrVrx~F%~p7heq(8l@?^u*K4#vxJzoy~2BGS=4pU>W4=@6TAX<_A8@ zGoWeW;~H1E^T39{xyZD~o>66Ai$dL1$6_~_m*L9eL7w!&zu_O7ZVAD&?QHj%Zc6e_ zR((PztCj`T+aTb=Njt|f@{yOdi$KN~C970@5o<1Q`GRuvdpjI(9zo;+I_x{EGmiA_ z-Iuv9nKsoeoc3KzXJ`E^9fXWd3n=-~I-c*+yGO!t0JLtobLY`h|{!EY?@^ ztSvgF1s(^PQ*qy=dYaQF_PHu6>c+GF02ZbQSF>^bdK)j5bMN|h%%0N3mrq~vI&*0< z5nlR)f~Vn@z45F(n16BM@&7A-e{$bp(N=(UXI|&=wPb%~fmG+qg_~?7pgr=9?z2|u zf~JHmI)ALp9s6U~i{QmGyt{n*8;=I;MeQHIC;q$lDgWHDGgG1LLK)sGlY*_nd41WA zvhG0H&NAS8V|w3maGI1~#JrH3W140Av|hdvb4AWiuk8ncW?F3E5sQs_VtO=AhbVc~ zdS&b9t-99H)+b|&Z?cU{bhA<tK~^>oK(nKGHf zOW6H+9TWE-@TlF#6VvXCpKTlj;pajI78aa(pdI^G<2bXk!< z_5K`>s5;rPwH|j|HM)r2Jla(oH@d z%?OR#zP8G*;U1A{vST%A?zn1npHgRYsP6s)9<}>;-?aPUm#IboI~KQ4BLkqaXB)x}8-Q)RO>^7eo-am&ucaB)qkU)~WJl&+ z3+-Ud_-ZaTm%+O7AAVeYzzW{#H`0Bse}h;gt5Qxw=u0}$Xw2{p5bTz6OqrzR`q%I4 z?@K+bpfY)aoEy|R5Vsf>XoRo-WnmfgBxsolT?dzyCanBVEGb3zSzFtmq2HK8sQhLR z75j!JGgZ3DO#>y@9-wHgz>;-wfmM}}WJQKH>lHSB!z(~Zyu)I8#dr!;_5`=6yjRko 
zz{{6)%?v-d`G(Q>1RuWC$Jf@&T5hHjYhFxB*3isvH?zr01GUnoj`v+j?>Ffcz;`_x zcrNT`^KhAvrHa9d-luBFEX(9XL^Jo*$k$L|t|Vvtkg|~ns}a*7);4XNR*k7qL#v;> z)iJ$Pp`2}y0H70mtqQkNnr3!F&+Kc_(jARWQ2Iv!jMmM; zX90=csil=m!8D^Ny{N&VbRTe1-cPTO>O6})g<&bVrGl>@Mxaav8X?x8ZQ?*r^gqHE_pH@RymdS~=>pt*u0v6;};1bhLPl(;DrHa z1ev3h$CT@U!2^Ot7I@BOA$gn%q6GwH3K>F`gSwJ0{xSOn2x@QLC(yhk$ikH!90VmA zWE-DdEA2=a&IIU@Uzb%8!Ctv`JtSAyo|Xrl06^OjE;sYoTF&T#4%Cy>-*6bmHPHYG zNF;QyIE%5Cqg&33P>bpU2&%|vs91FYggylNb~D!zfJ7og7jOHr7hAV>x+909lOv;@ zqla>`Lw8>Pl^@lTVuODHA+SYdCi3V6_%JHKqD=4E=|3g+Zj2IakKZtj*}04+opV=d z>Qc{@2$5CRHq3wAg2gTb|rnC;Jx|ZK7Fk} zv>B5HU%w;yEY{z3+SzTk_Qo>384ZmGoF-%a>q|bfGI(AG?62AUkk|O=Ux@`e)gwxz zr6mhH>Pt>uf_9%E9RVuTITII)tf<;i3`9x8_CdaaKxxh~Gl{N}?P;jBa4yS|rTWr` z>9lObUkMeX_T`tTs(hxddRe~P+NU(_+9eI|Lh>)+pFh#+CCiEB))fv#wsH z@V}!7dQjfx8nhXu3+b^#)xfT$%dEe=QZBDrguaO+r~&KTSbi2zKCAQLydQomi%GLRa^M&awx<`cwOtRbgL({ih!K%PLshuY8#i z6JGoZ*phzf_MZSq2Lu541SV+U7&OiSkP3a;75iP5$!kvwy2*f)#Z%qXRyP9U>|(zm z!g0(z!PnCuNyC#l%U_25~3`TF9iCr22?j>Rh z8Z@@pAOfT}WM>3FM9hry_zyK7wC&3%nzz+yoQ>L^hyZr8J6hn;cHvYE#X&W|h{i0? 
z+1tTrG;hz#QZAYW(#1#y<*k^$@l?(dxS$a!yO@`V$uDRioq${LbS*b*ZsG ziP{=Rfg!pOrBW)TA{F`=5}|dN?JLDDo4v~RK<7|SJ^0Z?ZZqr04z95texwt%#^u|Y z7p=5-6U#e|0E!#HHxO$Q-pt~YK=yW20!^R^G=U}zC(s0%z(`}*jaJTbuW3~@qnFc$ z^E3I5xD_48D*PO?#u*;9D={ofmnkqSdSR&=6ue~fz?KfuRxV-fAREDOvNgf4vKuGCMzr##uAfOdT1^X=BIN+QT!E}?xQRB=CT$I_#;NQYyo;^LpdjVeW#_)RIJ#-E zN$jK5ffOkI5`DRM3sO9c1TRYL$}uuo$U?a3IRHSQH6PuBz-Drx6a~&?HsM z0@X{!!D#C#XLqgvpgKnIcI1R*=foQGj*7`Sxk=bG=}Xv6&dIyP@2j^e8r?(YyJLlB z4>x27=42oT&W|koLC^H3-!y$PrIa6WpIY}30QL`CqUp-(W|1}$Njw0M{*ype!B8+> z&Vl=LjQSy8tGf%<@R3Rb#`GGPD?!9C5D4s{L?1p5xR0kch4Cw0E>W+)x<oe zd^*tNwD}Zn5_5VgT*D1LRLzhWa}VHlASzPaHo85gd8Pb;^FKgF0Q)~5U6=GB3Zt4g z7xfc;@3*Gf_NG?P$+UeA%#A&}&S-xv?=2}GO=}j6-wC)gK1OxrK={k=fc44boxT^p z9>$_sgV>Y#k*s>6=6x4rb<0HCisx_!!HeAR{4Q}UMD~ffC;$;iK4G(~+yLxi(NY?z z<4sPRp>X4w(^WVidBB(zxsh*HQ>Q3n?6mhqzt+H`f$@3+_xA$(b*?++$46owxdc&_ zlE7uhYNp#Jhlqs&|09Y3=Hd_$Aov~F27XWs2$l}~)hP~`^8x2CVE^gSy5|8H?*#5Y zlnTs0fcFM4ZUgRH111^c+@rbGL3NxN(XH5Z*jq*pSnl(}cbc~VqgY(1q+l)vU~`T7 zUp9HPH44UT0JjCqxmwlWBkDc=5xNtV^rkDQEgqn>>e&f1gng&5@&@nch-TH-=}?KTIb*US z*}HbN7+nvHl?R}2tMeiS3mh;9;y}G1g9&ByB_n2dgS&vzA9t1lqL+647LEA*k=l=dI`MqGPJ_%u_#|ruF zxkvi;xfb(ceu%96lAxI0)(!Vz!hm|-Q5h9C`F&BoZJ5~W2;7|s`E4=Qlw$sGLA{;d zGOCH2uoK3dXKtYnU+3O7m(jKw)(?dV`3hrpZKnX^`LaGQ3f?6SOTMJ{QKH2FBIe#4 zd(rz_eDegN_d`r*)Bwq6?&&+(!FZ#Vv+k|3F$hu{i}9gZ=nl8KBcR8|M}h7b!Z?eY zstLHU_rTKEJ}E&44GjwVvdB%4NEQiD4a5x-Z61e_g71!2RK+5k2h-`Uwl=V#FqXev}aPC(J+C4?=#n5!rBk-|Ji(Hw_?X2&xm~`tZ`!N+zcGm(+((X?`B~ z@{kLK2|C?W1Nc_({`mT9$jK{W)iHkx2C{{dl5!KJov9|9YqHFyDt5|MJlJK`=>MFD zr!+stq6ff*_`e9e_m%!PG*$X_5cV8Tg{`qivd{!w#y{a?ivIk5JN~HNkhb%$96%Ah zA!Z3(_HTIs327?oI!{^xVN`IUtNZgzd$X`jp->xk7_DTPS<0EU%>c2M>6*K?o zhv$CY`CDHV`^hUT6PFAKw(FbGWT|!GTrCm*q9>?mUa~eo*=DDa%GV_Qk<&xxin76 z*iuRAye=exWP*b(CBrh)L{SgPJ}nA5m{dDVTzEE+(irSb(XA-EDw+dDAW~J_(2n+F zrO~~u0KYgbaV+mKdmnd=dzrJwU?^cWZzrJ)?UoB@>|jV+DY?c7Znmw5vlX?rV1OCN znxQ(|7s9b*Xu30B)W+mJ6bcm-V&GUa6qh&_ssgG&6>v&Bk`hg$Q0freS!n0H)IpvG zO*!!P*#0&{iR)el$};{D7qx-fiKPZhwfmO?58ONf+YhwU&KmTgP@X%VJZsQ)Zg=0` 
z_dSh5vZZBS>jDP$_xR-^>m&P+L^C=yi#~k3G6|kNWIEbd+b*I#Nc&7sf@$#5V>^ph zcb>wJ*}o9^jBl5z>B0kiL*9Vb`v-Uxz&2iRaC zC8gz(;`DS8hJ8J+J=@YU$u7)5vvp1*E9=8`4D$dH(^M5GDw#Zq4a~!{KEe2CyukEy^_y zc=nKy=wVHZ*R&Q0OOtHDMscIZ-B6CZ5a!~cdVXqI23`1|$%}rUr(Pa40ID2-}U3+#fJ+CPwM$aZTd`Kj;Gs9S_zGQ zy2uaB8in?BRB74o$gt};2$0RJqfdp0)6kWNuc7vt`&~mX!WRsk;d?Uule*^gmVBJa zWeyf7J*MfSFpSW}a2#r%8Hki#gfWKBFiAW8QKhMs@XBko!l2%Ew?P+S&9ugiFHiVP zxtGv%<;2%{sn+)CE!*adRvg@&(k0VJVHlw+567YQnE{RIMHpk~43lK~8*=k6wi^Ik zR(5?02Xn*b_zWO#<@Q0MYTHc)&2yA7d>Wq3 z>kT7M`iGh~#l6dC@;*(A+tTYP_ODd``bb#>;TKOf#>gS};OUCP^1*A8s`}BR!SL>NXKBF%T1oG-1s`l;SqWm#e2J3^_Rg;`#Kl_AdBuA^ z_Pvv`jh~@Ix*QEL93m!S1qzFd7Uvjejv;r*S;{(Q*&Da=QoC#u`H%}<`Ja(BPaE18 zrk%d5uwo>nXktOE7q5w09cRkZo!eHhw(Yf)*X3^aG;Glm{r>qatpc7Z?%YB9zeKr${KLkug4JhCi^xCMEXxPdH6j zx{z!UBt1nrlYgX`gZ!_6)T1I1$wkt|gLq|Y+&Z?=&2E$1jh&mQl+kc_Oq!pO=0k+;3~=SV=g(do6k z6?#C{ER@Jq)^JUW{DKIsE{2mzE`&&s1;IoTQ&KrA6={f~es=UF652ywz`=)wfCVXd z0fb&C1Tx6M3p9k_q1Z%=H;_;fqJ~o_fr5rm5ye&3Mcvg#4>i|r1-hrIhFa3s>M3I) z6K&pd+p|XNuo06sWoyQmVTm>DtW}IN)q;80t>@)=Oa_s}tk@Ej=oeHZu_V68y9#bL zR&rV%$Ur{65M8#4GWBQFsnS%W`U+K@>e>>%;VXJxe=bMVH?(dR%r$dvw#`|)YOht| z)_y~wWmX;T-F@`C{LlH_urquqoCFtc)USOuxQ+byqBikC1+MbgiuyVOY+5}Vp_ zv0CnOZpbXZWSBcmPT4QM?3Yw_Rg(qP#L2o8CDeyHUe9W&zH0~Vwmr8``V=Ek_D$Jq zMr2ABWR>iHxBNNp7Q2gwA}k)1u`(!+OIBW#&nEHe{uec#t z)6mf7LwmT5?N|GwOY45RUwu}8(qH#KhJ@jKcstyV-yZ*?9^(jNFXm$y+ zIzgMFy$R6|=mYUUXJB36(vaODheN&uwZV?y6JaK_j)v(Vous?y|AqY<_JIj7htQ+w zS#%El6e@1k+Y&ScXMR_mhbCqA_=Zg6vuZR_~xG&BY zR=Hco%e4}eudB`KaPweGO-pQfY5iL3%{DR#Qg6oe#T?Pkq`bt;$9#OYD^|PZQh{8~ zCAT)WL(nSd5)28>3+CdiTivb0t(V80j$4d(&U5)26YzzS!gGnfZDVaSiND&7?R%3( zlKxJbP5RWK@9=lDc5LjpOgJW-6@KdYtJBlTb@p|>mI7V+U6wAsYfabTRAP62&!QgP z9#xOCC)u;1=W@}A=(^}b@3LNFZ?bnw?{jHzt6&lkAb~v)fm4725iCFl^j=$y^-}a}W6Z(S>i-8MXxEm+q+W02k zjLihn{q;Sse)H=8ufBYBI8DX})$dvJM$Q|R;DNPGPkf$#whRZLH$V)D5J(0PC1#L@ z72JyVEN4y!t8_Uq!QPI!ha>0lQ|8}5J|%pmg5`_Y9li9I*X)0*Re|g9J@Bvr!t^xC z7zpO^DM4Xw0H-~$bXqffcpt>Bu&&e|n56hUQ%_Y4N9YJPKj3w49cW`#v_SE)0f 
zQDqXWr+rQI8kalOGXy%wwtihkU-01ltRNJEKr<@C19J=@4wUyT+Q7n7Lv3hv9Sh>uaw;K!el(z5&FI z;33<(w4IS+pb zD3z*!*CAbuh!aQIoNUx}vN^>RE)8h6Qa`Fq`sF4TYP%~UV&UO#ULdwhx7i`228lK1 zeO>yCt6{vb)S<|vKK=CcKXEe38Ks{9EH_Wk!uhF)n+!yqyE|KXwJZ*t)GUBM4k}FI zxG#-lQBZAKGY0NNMj%Vd0{$H0X=@E3IV*8zAr9fra8OI>0fe(pp=)hft&V z_Y_@#*A=Zf9h??3y!{F?1s@f|@N*0Su7R87EQCh*3GnE!ltEIH8smGjR4t!#Hdx|f zc)0a0T#27?>e7y%#)D%BZqbeYJQn~H9_|r&T$uS7yjDWrn<>Cxw(<$uT2?cfs7%Ok ztKR&f!h5*`A~1TX4lZ=Hu`lFB-e_5n+l9AtGAo4Xc~CiqeD~VOc%nH=S`5CAv;t zuj!VU&RJ?2;x=h%hVm<25grZxaei@b%rI8PG);9vb~>lFphBdlwj;}bdW9T!KIza3 zYs}32%m4r1Cnm;O(bG=ljH;@Wd_{dsR6mBe(LmHIpI=ei`4PV)CUm?kLk8JGIPsBY zei(tvW2i-~tuUbmT9c*PoYEaVAtEhS3uqp=gsA771bScK$n(zCOw9ic2S}Nu2f$NA zGO^!%YBG8?jDh zqEjLcB=Rnq5+y|xPfgtJLfZNK%y|+UbwudV#bVLtDA0pmq=}$qt#5}k?fe>Du@z~C_{45eb zd;wlC_5}a?Q2fdlFvobsS8WrAUMk)B(xf(n)Pw>S$pVDwWuOs1A>8(SBWR@9lns*o zBEPhkCYZ`hQ&dp~zujd^U4+sRv=*N_h@$pgtaOeU4K@YUY`{vjk>*_M5+q8AplILm z-*ln~f^W8uA@wk~f|IgMoA(YDuYkZ8GvCH3od5OB>5D^S&8frC7{mzh~TEL1_f zi?<6%D`97|Cu%nM>m5r;c%B)4#jCyfbSwJUG2E{<9BUj-a`&%$7s-PJFb#UhRGfMr z$CvdV-d_2IV6@SQw}HnuJrvdi5LVsqyl=iq1fw@6!6!-h-_A-23_YK)P3#)#Vi-l< zUg$so3b>fn zOmAA5!cOC~1o}BuP zd8ZF?^Df05|DY5nYa|kQhH)i4p1#@U+ne3tsxvE1PTj6e?^X#UkmT-{R&lQjnaY= zB?&3v>`CQKiX@OPv4P5!!ct)ITe zx*M4c_Ru0qM%4$L%H1C*>`aa?(X1!Dhwz!sKQsnS&@{(kq^`KY0W&zK((Us|F0LVZ~{tq=5Tc<@v;FDT;THc1c7cz!l0c$6h$6ejF%z~7W{2;7I zkR1t6e6+-C@W^5K3bCM7gF2}WtUk&{=ThtcC!rT-TJCIDR44DZ*FPQRm3W0yp=vPn{Xgun@#>zGzZ9I$-~X5Dsl`_7`CnmDKvT+i$|dE8qy{GDRo>&?Im$-UDY9a%UQDrTz`1*fEA6 z7SIMS)hKB ziganiF_^qsGeq)cjA-bcoJPI#10#@<8+e}--JqA)8B;3g=-2SqDw1Kw_5=woX9!by zoL6)Wh!E{ATS6gF*TkT{ZR5TUjL`4neEQM@Jca(eok#-r$C;SY4wj$YM)= zq=kA#&nyh$i|G4Wl?a)1mXr=gkmABV`_A$YUSG^Q7&jU3eELChHaQmf^u5U4js zqpyNI@PHFoJAqpRdwBZHcOwV3?xz0r=TFK5B=GzIzSw|FN&qaZWy0G?XLKro8VVnU z%O&^M;Iq_>I}l|*eEow@Ezy54DR!v+s#06E2poZBVm1@E3U$|bJg7W(Bc*ONSMzb>kP8}gOUYkH-F$Vi1wreY93#uYqiMnN1$5~iV?SKZc? 
zY&pTfUd9<=F_WRFOSk)5ax?RPyTVXF@x!O2xUz;YmZT zHUc#zhvHU@oV2*Vo5V44SbR&mnd%t21b1UB{;>e7*|Ppg6s`kDST3mg>RBGt!HP0C z(>xF9yHSMTe4Z|vw+f8QVXCg}2GtFC&=aU0C|t=eoekYES@M%5y#7fAYOzZkBG|hA zB=8DCq8@r-fRECu7lqY=_%+&V8gaGNPjSzf4k&Ha1w_oJXdyP`B$K3(@QZd|^2xLq znWDb2@6&Vi31AT*wj4_>E-IU-!h~=WW(o38ii$JV(J~z}+%GH#;CKLgs+=m$a8>Qy z_@?P75EV+N;s`}=MEPQZ=aTG3ob`@_X!?-zpyPUbRvQ=>mn56{aAj zXM@HO!&9UwdJ#{YT0^u!Ac93V zNG)}eVLvhy{q_>P?y8(RCjdtlLqIOu+`i`t#C)viK1uy(X!>o+Qa2EiNt3&t-RlUe zz(7Z<(Z=EFlt)*IgkkBsuK8_I?~VG1wDT;VSvAE0PrwCN2;mDws%Nk^5iGrV6uP!T zPl2W}*h&ynw-UwBT?&W0>)pe1Hco12Y>n&c_L@>KG7W|2b}K+J7h$a(r4tI zmIcu4DtR27Md<+0ul|k#bj{Zo4LX{}RLLe;$FRf7_xVz#NzH#ez@)JKG`P)747I`w z8{wRk8lX9>YYYYp5VhQPjx}L3CLsl_`8i!iBw4p=z5Y!C$G=am>c_6^b#F!~_LbxD zQ!6-#MlP)(O2|8_2q5nYU3LeD9VKAmxzCu<$G*SLz5BxR6}i9MSikD}D!z**-tEq$ zeVh3->E>InV>Y}Kv!ko|`L~!48^)}s(y$1`!LKU@7+6Hw&d&siio=RLp9vL-=F`Nb>rW=6f~J7# zcgsLG`>L;N2hab5-S-Yl!F0VUv9S6Z$F9UU-mEj0&;!*SLNoVio8G3{S3dZ{b z<@{l(#c*TMOBLymRNf6#Jtdr;?{=u8wsSsyGb)Mz#{)F-|tqsNJ z$k2Xj%7r=EL2WaNLy9qdNcQrss#1dxGqftz$MN<;h#BoG+pXsRzSC)?Uta-9B@~ip zg&ygVHq4-=oxorz?ksicS}5b$N0s$ zoAim?1?0Tber6VYAf+MqlkkqCVH4UYh8-6cyTL2m>6<+Wo%61PDoJQEG>rEPnPC<0 zJGPvz7QS1?IcC&f9K80ZtX%r>DUB$Z;Xa;Dvz&laISqk<2Zw1nutPR;L;s#Ce>%xP zTsJBNE2vnaik)3%c7%u$udXxitDe|jklMyoBoQ$>!m@rKlBvkrjzw8X+(g&S4CJ?X zx|t9>Y>Ot>v2cN=Lopl+wqm@_@bv06?1Ejn*2s&RmIwByS=kFcg>igIbuz}e=(r@g zuSUG#LSL40Regl6dou)-+cr}8hVB7^xSFyY`%Saj$@mKs7w))5tAwhG<}z>2ZWTgU z>-{gcHg8t@{g&SRBuw=+;b^9_(19KDvh9VVu9hQ15c*4ehrQTQ;mR00Du*iP^hCkys~Nd)>XUDrdx!xkD9BZU zvZh#QRX~p9KZ(OEp@dROi0WG)wGQRY^qrL$V8`07ddH7Y{ZYKXhb8;V(Hv;w)LlDK zJ@M&AJY#|KGZ~C+nZ&ds2d$N{Qk#mP90G7LtyKp-7<>(cChI|P7V(&{!`$nV0QP!u zybR)gW6MEn4RG{Sd`etA1K!R!OT==|Nt5t~$%Yu*C2AT{AVp+#eMR`@qpiVrFUP2$ zg2!nPBOHVgSOma{@?B0~Aq8NCOF z5Crx?Ek?8)q)J2NK6c_(L^fy$M}sPb_RtHbcZ!*!Jd9mS0!Om8C0PSP%s!G_QAkT< zYSMCc39x4e(NCcXR-bDQ;-^G|rUl)$p%boOvW+MXCuV18Q^{4$0UiXEsB&ROXVc-% zF-E~eCwAIO42ou|X<3n;rqeka9+ab6IVd@vZh2m7gMAx?4MN98Yf6NJa(5W^DMi8( 
z6s(kAipix!>*sC08e-IZXm~JWbrz*21*?K8h|r%^Flc|5GO3T3igGvisy_-62OdWM z<f%afr9M5;Q1{umIfger{fY+;iI8Vj#n7rU}A^EPa)F{nKRUg5-YPiTj** z{Mq>@Z{M!ff0=?bh=vg$({?i%Yi9{Jz#yt$GY~yYe>4+&jqlDT#Z8x7jL*s9kPw9G ze-SO6W9`@tvBHSO#f4p>|xwH!xUUsSfvya+{r*`F?>w`EKfn zAn(+qdfwk(kRIQ|Ri!&pcGXCU4ExD)2bGm#V4oNXAsxYaM+q3*|l(GPi_jU zoipY@Hx$}drRt_<1x3Pm^gzg~gLyZQHsBSFHX7w>pfFpG5wUA>+vRetHj3PZh7_Nd z#nGjOIn%S(yXvxJ6gilQCzROUb)RVESM>Vq)jFQ(@AbEBIF$L23e?5%dHHz1Y(3G6hle0F^K-EF^Ta=h0$8) zFkECJ>}C=Ud{DyRN*pI5CP@~Wa3`max4e_Fnqc3VeGEQ)GME9b|@g-(jEqSyo=l+XwG!?^H2C%JI_ z2}I@2a6$pSaTN_hvsp)0p#9u7`AS9Qj(1!MdN9z45vqfnq$4)^=0cx}8Z@2uJAhG3 zjKpZh2}o|}1y4Rs2l{-jX<>3-2P+r+HFK}@wT0|9DV}}?qByY6o*%MLiojUIq z?$4ehs9e~G*(tDA*-_Q?v39tsHXKW~+K>)*yU4df+1+3`h_Cqf>;vEMwXVEkvrC~h z%7rmtro~!QG5S^7S5*XVuL4|DxW z1>#@C@rw-jm>TnYCrMJUJ2|A46<8Mc$F?zv)6vbLtlU>*ywv zy+xK1&gi&!i&lk$h#^k zC=_ZC5u>_KUuaQ+0#SWr-*OgKuwo$keQBY#~>`hUTjyb-xCtMq< zKufq7WqVg1eVI8C4lhmdo7kbiHEJYO!S>c%NTDNAJtnW(!WZB*6+9nmcEL_Q zY1ULzjj6l6cpyXS!oyrRK~OEBH+1$3-{q$fc*;nHU!2I z0B?hG#I}sOYxh(^BpOKXfT{;J_lSkZwf#dqR0Z;>&TG2c?bWyIOx3Ef4O{uEGrQ}q zWQ$og)?qQhipiXZIpar&nk32YTEi(GdKQk6ccr3G0-S&sq;AiZHIJSZG@C+gAwF@& zNKQl{rIB!l_M7uuBsL2g>^fozS**4-pXS%0_=FUE9w9tQoYOVsWMM+x7@W{brs*?O zfS<4yg-~sWD}^tV`Z7{@l1Y-K5(}?UN2Ee4R3L2!7ix3greiRK& zb!RvTVI~^;HVJ3ctynEjO9J?|o8b^5wUhZ(<9VSu3jnr10Z1`&5Sn^d2ecs!ilBHT zx21oU&BiX_CBLxvhcOhJp^=ssSv_mjdlVvwO*Gmyv#*jwNn@%Wh`62x(d=MaO_(+H^0>J11HO6hU(i(Z`5->nRY zkf;!(;aN}YCZ!vr&(opA=bnk(63bBCGfBnbVWA57FG0wU>qLTS7~tr#EdE_oBB^xdbf+J5`RO4kC!7 z`0!y2v&>3Ooz2#21w~Pgkx-QKW^(^{XeitID1whE{>`95s-G<298)hR|G;Q8vOJ8o zqr;o{mh8yge^0FEjIc)kTL*s;cH9EL^qPxpZxCNkro$~2J#X5h0FL;L%z54Imm+3^ zJ7}bIgA1lDMJg=Aa0;{x-|eRY;B;2ya!2liHt`gDp&PAasA@h-h(^U2=f==R85()C zF5vgplqlz8ztF+(Lg9G(AbzNQaP5;t%V{;|OjIT0LI@f{m|mqkrES-$t!sC|1vY(* zHgN8B_4KKFr>F!zNrVlE4^Vb&OQOjXJks%9ocgZy8;$7iq;xeqbLLVAQ0p`Cuz4L2E$@BL6A(A$73b+ ze0}`2)Zd~j!@;+cLe@cq9vlHU^TCe>yNcUftS9WA1?l@z%lAj0mJ(fjlkD5MhpN0> z%raC+Qn^$lMD-|ntHBByAH#VFDbAQ`e7|B1G*~Ik$HH 
z%#~P!*PsIUyt&eN`^(A|xpwdWHOg!k&n1}t%DmlTnZY;zh_nU~pi%ZWH`muzJ`@Gy zUOKRkwm!I@dKk!q*T+DeHTWo+^jr&>w~yaAe06WPFuT3gWdJN7z>Up!_s`Y6U|rtz z{⪙;4fF0uV`)>KtF7dP8JC+?Z4h1uD}1#o*gd{YWTxn6vgLXLSE$|2XgNL=hgpI zy=;~cRV?F&U&L?nEPXF~8i9@IPrTSpA7ZesTrrta=Fa1iUqq?#@bQh`V$RV%iigp# zf<3=JjM_Z~zb84AS=2Wcu~ zXQ(Vv>|J+fV($2c053Xwy+o7{w|AQD0={QH#V`b8-#C8wI)7{UjSjc;W{t%iSG2_! zW-zFaH7wN(v58Y_b#99!8nWwl#M@KT_(|@Yj;01^o;*mRDI7-lfE!jXRFz(5hMs5e zrh{%enwSpJ(G2rgqagt!WftHqa;pX)KH0!tGz6O&g8G@!DE8h{hr!T$5c0?*@=LEPcESkiBt zG1Gp6tC2F~{o2eiaVgOmj$;6~U|;%w`Cob}$}Coh$(s1*SDEAPC++&a=t@gH`TD=D z;c`0IcGij9x}E;fjZ25k>*FADKdqSqYKIe>WOG)*W|_v2npRldrqu86nCWID_(@(w zGk_@SMl-X5{jF*E^=5>og#D67cX|Bf-zC2A)Xm`fXDhmW12QkaoYH~3?Hu;f|}o~?}mnvtgcbVZRjtekE>-ZXVuu162o`Sd^uW1 z{*15NY6X|dJU;k9YGcw;$WV1EC7Sopgz)Lx=iQvZjFAp8g%zcC3^m5OAE2v4!?L=X6IWO z!jIliXefb_<&3x6D^`yXQWZkK?!+Hugp5_cj#39+r^HBt!E9Qs0~(DN5*l>51I9l- zo1#_|&C?QU-(7jT_$eP&;9KoD33aP75 z?`85X6lW>b#0ooO#vwYOoRCH{*05K_Vsl;IzxzmAo!QTdAZRcfI z!AFh#?^gQDlL)xM{mJ$v`zI&+ddG#jI@x~;BrBmi=V+#0IA&SchT z-|r@Rv_A9MS1TtH2xRvXY1K}@vxldHsVI<#u5D!gyCAzBcMSK&TW-hQNYER38GLyO z<#nW$YOmc$fBxBe$hm9M@~TFke9ZFU1HEM`WAFkz|6|ahR3ine7n-K%=>X zV$eY8!^$p4GwmorFf&Tg!K&YAQ9PKD<=&!w zrl}wh@k?nEmAsx}#nA_)7*(yOIxcw{Vt-2UE<0ZSnNw1?R?-X5?scHqsYxSlv+{j=(S7_u~usU&uU!9w?!MvEVjaox)O=e<-QLY-l7!iGt97cCzM|To5U1h#?kNs5jZ{U& zu%aw+DO3$=HXzq2mO#^u^I9Lp-x89pY0(`lDn`K4u`*Gs;`#mH+k7f0mO@7#!W8=V z%*#G*na#vokUpv(cDk)f3Em(-VJ1MaKh+d^bGOQlqr+5}`*ic!>A$@9IQfO}+#3F_ z+n2WN$D40=eob8){@YW#>%AU^PDsWWlE;)t=tA8>Se(RTcTMd3-gd|_3F!G?M+kX) zBJEgt+7zM`iD6_x?EoH+oSCAS@5EQo2uEq5DL;c;LpF!c(BV=H*X$* z9Fm5Z#>!WA|FLdt2x^G_$ZYMbe(UcRIzrgkAKf{&BcwQmzA8PJz|d);Q7z_ss;W%) zZfS-q`gyFf#sM34RI@~f{EatJkf{%XH3m$pCmhV`2tx9z|Bsch38q%jUM?Q-1 zoz(wJjvpt|T}P1anx+6(ES+RG04{yuDI4bKapCAivL%EsKqVuKSn4m!CIs3gN#RA$ z1ZoyXMa-#-2UX5P!?AVY!`t;-O)5)sp%2#ZUE|3e@^1K;hbt8!L21N^qNrygV3K6! 
zr?Nf~^l<{}_4_sek3AodllzZ<7c=)$Bq)5Dh`>>fTR&;O>h=!V@ zrekgt&7s}$3@++LsNS_I6G4-w^<`7Zbxsg0WF&zh@L~ZCI`Cw=9A9r)JFfS%=rh%E z!u6RLFU?%9TW2Fz5kWFl3R78|>(@m;6eZ+-I;d@|ZEfsOc;jmlXcwg3ZD>MQo%%WU zzVl$@Or&4!zFm8|__nr|zE&*iOVx*7mb||HA(Ya{p!4MAzwbPHw7*_J(nBuFx1l8i zZU?hGq_-#;dp2UsSOAswR4(zBU`FA<#ZU~(!mX}>?Ngl)sS6E@Zd$1y|FkYrE0)q$9Ymv(O zDUmsN`ua!N@wW#KD%f$0HCxhC$MQ%YCVl+86U=0BLK0=;Mfn(m@9v-VKvY4j%41>p z^`npfC#Sg1|2mEqg`PBvj&h6cdI)FAX!siTkTSpM~Btq14bEnbbmX5rj-`8u!h zy&d+GE48C4s8VXvw~c`qoc__?VE2w`{lpyX+5IT^u>#xOUX74Wi)v}V=(mJKnij#V__@8R948S!+k{gqF*D3sQEFur0{anY6zC;mZtBx zmNdajn4+#m`#E{#P2&DcE3Dmk!%luUXfjV5D0iBT*q%l+t=E*$l6@~w&O?x|nMXc- z;JWoiuuYwydz+HtjC?@;XmUFBxQuC_kKd0Js=+N3Bc_v1$bgf)Y=3>u6ld~l4h@b1 zR~-1{rW~vubHKH3=^r>*eav>VzUm~icYTl%DCD-S+UEA-J5L93JbLG6n`~be&Q@o{ zLFUGHCmw&A35QD=f`ZOsJ?%_a(gfL%!!>uCiY+sQmjLrC9bAiYT25_Up~w7DL@vEJ z0_FVGaDJko?{C;^rP(gMbP}ZnloIyg24?L(swUPuMO5|PKE`E3T+^k>>utrN9#vd; z^<%I@<{iggKQWim{2o5Y(!M$Hl_EUN5?T6e7vK5qNu{Nv5DzP);Ho*mVC zby>HNo;>Sv|*4u5VD{%l8&lNLh=OQa|9ERMv> zvsv;<_+vl|d+7rx`BwF*I;`P4+A|7CPVr)H%id59^FK0IubghbKImJ&4h84-LXR}< zg?R)gH${}hF`pzev$6sE{9UGU)W6z;cl#ijl}4g*J8aZ*c)*;GQ)O>b*Xsk5m$Z=c z+j!OY`Mu9equQO0zm1HZ^aFVN)tuv20R7b@@~4S_VsRW zuh3S_AhK(!hcCGhUe3O^YHF&xMf$XIb@AW-*}UQ+*Kj7o`5T`@>}=_)kr2fNjD1%4 zNu+R_)Z=n>)lw7wOdrn^m64mXMp-RA4os*&J?ErweI%JSRE=XzxEIsf*#@kaBEj2g zx6ez@0qTV!p`enx>{6;tB+UV<68;2!uLSw`7jEUP6ylJ9P4hJ<{#M&CcY}y)@5LvY z1dNcEbaIINaOKhAKfm#L{5E!Cxzj0^$0q$Iu#wP{7v-%J{dVEO&a1Uwl9rtR>l^QE z6GX8%nQ76)mw}m zylt)x<+hs!X{{<9DhsX>FSDCZ4{Hzchc(l@?bn-AzRZp}2k(#H=nAj8Xoz;Uh(Q<`|cz5lvSZGL>C_b{^cD*H;EcP zcmlhRhv@ar)s~$?dykEe$MF684P6LSN`&DN_s-Q7T`L$hduE-R> zEYMtu}4?3Z9zikk| z)igQ~lQ+&q)6%=Bh%w$N#~}h19v)x?QudJr0bR(Rp)bg|&M_ zwTpWtTPd7u6ZU+Lh7gnj?eIv?@-cGFbJB^^S9TC7zCq+(Nv3t~^;Dxd(`Xf7rs;)N zW2U(jCCxim=IT1r9F=*i{J@Rty(?`maj6uXe;nfOOe54GsN`&wSrKa#_l6i?4Fuw@ z|I#uzbc(ZP$1phXW1`>$QIJG$w#G-PAj-0!D68Ubh`Fgd@rdJvmI#6UVzb%4CBbI< zD%ilr^TXjbXf2{?YudK+Ufr}d6u_a)EbyP!%JQcvBZ0@s)x$YHe*s#MkPka^4l7en 
zNUcBDBgi}LIqr}s!eR85LOKlFr6-_&KcN$lry)*Sg<}8aI4pXVW{hVT?F*@CDL$efOEQ`*r z6T5j|;-v)CUbc10r-IhZzq>3`!jxrHxh}V9e65RT6CP*1(L5!9(xTYQW>5MI+8Oo! zfB(+>TTgU%f2qUoH?|*H6){!)8^5*|$8(=%Lr-TsExGGcz0>>MrAe$bqAe{u=q(0v zfz8rpSYRjYvG@UICk^`l02e6r-BQyHP>wz4)@(IL%ssa-O{aJ=A*PO=(z*7r9U^*O z(6-w#(N1Ap91`=HMbfAc(O_-BraMu=;aA!+GmFo3@M=RDL(>UuG^fbrY2~vga2v}9nMDwn(l{4!!5AoMGfxL1&jfG6 z_l|&U9cBmbkTVMblG_s?F` z?jp1ni3kpJ9EtS-2%}x_y z0RgM(+UwB1a-}EM^6p;idD&^QUMsyleGg*nN6lXEUf~C4Z`Ss`eKpgfF+2rNbG8ar z|H$){PamJe;Sl>Xo;Og?qxMx)K>$u?cMcKagaVAqa&xwXdn5@Z1sa|Wi9sP0k-%8n zxZX1tGb&fj^Nure*n=PwD&Y6m7Z4?=RC`KjAvoPuSSej!QCPV32H%HNzra?&e}NxI zyrrNXvz6Xs_A{)(0h4S9Zb;(;CB7=+>0Gs6BYN?*yXKHS>v=lMgH9sED!?x)9Xhh< z^i99ud+&{rvEyRNbyU{cq%Va5I;RwSBEJ>oOd5)gloD5Ho|W5@M6G5qk~~g>2j(@& zvr47HI6b`58$byE*H5M$ECNR)Yy^t?DsFcaIW<& zLdyjL7P@Ttor5s8snFoBuebdPfErKqj3(YY5vS8R)kR-Q*%22#YgWm{A(UA&`#{6K z$|KN3y;Ng%8_x8k*JPx#A!m=}R_n32nu44fstmiurY3kag$Yug0Z!uNKH40?)4aX5 z@AcfCx>_`lVG1-5FJ$Gc>rA8DsjKh{L@w9^HWM{#qLh>%law^FIpz$SUv*q+I6le; zv|DQcyg;fJgSJ@a5{X0DmZOLipi!r0bP(r*)=vL@pK&Tls)P6?ju6FsGS8=cO$| zSnei?&+$N~?yM3t<$L;CpNA8oPTD66bZ=$lOQk$VHk;sO_M+iGQq8%Y4Sq5WaJIb! 
zuOuhnUDE4>wqo9-<)QD=XIdMKqs9!}JC-LhlAB zMct+NRJI-6b$6|81J+q!S3l_#!hHuG4i$S_T}?0)=s%HQufqaRZsiB9CzdhYkgVWGw4rMhPI95JH-(2-%tuy;N}tQj|m?u3<+cM-|1Gzof*_zKlP0=b&a_ zTKN{{@)Wd{&q|~w>?;Zr9x!o@_;N=emkl0QGLJ;oz(u9?@aDJa86Te8Y^^Ehbh2&N>n)X|RWATcqT$V40QTAz?>^&4B-u zQlzlf5G2$tp?a=IcmIy5I$`;2qf`jGR)IwMbwcTYGNIZCSqM*vG!clI zq$8avxZuf(o)tKt(SgU)%~T=}r0C%hw8n`r77IZMAqZ)NWU$1w%L(bd;=;)dKNJ`8 z#`6V);c{kzy%3EFUP8D~wz=794t1-K?FqZ?KKIB4*uHy+LJ&4}W1iGUvuO=~74pGi1b~H0lW|-h>7Q!!VF%URV_6|EkI$t2iLx_Vk%EL0#y*C+PKhcEhbr2)h zaC5ZKsQu^!=R9rRHs_py&?%=Ag0Z>i%mMH8xsWiiZ;P^g);aKz0*V8oq9XW(L|^$m zEEQUf50cal6te3UN+gcyDL0m)jn(pX+x{up7#pP=3(oV<2h_L==O=)d4WXD|O3sdj zlL|16AxLtqyneGhHr{!+UKfaH8ptRXvnuy#&UXxf5fzyU`tI>ICd!NtQYz)H*J^{< z`iTB;w@A5bEcj{S9acLo5DRdetr4= zUSr!X%|gUUsk{W<0F3zI-_NCcol`ND6|a!>2{83CTad%YQ9~c~UMGQY3o&kkM2hEv z3(l!?aKzX<3@y@sO?nY9?7502@Ya|cRiUV^p?-mRnDRbZ)0uRg( zA;yiB!aP~b6c02qZ9ca<1yh)HLc=nJ^#f#$jfioZgOGNd9;F#ymBux%TD{OAl2M$e z0f5AE-9)AYa)a{|Hw{&|2v?$b9ER9SmN7NAG!=2BH(gj754JJ#oh_ne#slGv%Zq^zmZu%ke5g=?fH*TM=OI}oz z`ny_VW+Jr<;RDb;ZkCoco#}d6XIij5z=6WFEE8XcRtzE#m3>vG-g1ix!&CQQO9%w#XE2?|oxe3?ZIF6M#kAC{1#!XUrN5(hi?t*B4e=osKwBF^u1C$I2e9fFyO2v}I#fQ)d!r@E`9Gb-s z_zeP1A2K1ngUrd9+ZNZYBVi!6!fkrGD$sOd?#zFwLi(wiDJh0D-nut232Vlu4ORUB zHj-pRn^ssKUnmTrs=7frWR2vcFzp~8bnXNjnj+^F&4_gx#Y|Asa#L58a#Y!?odXXR3>2^ ztQ%Y+xJ_vr_m56DEYHc+;*Q0`)htV0Tei8eitE?8IA<4Gq?o>LtSg4UMwHC#Z795asXJ|?WG58)Q`d2<%ip0Tavwa#z3yYch`Z>SzdZ&uQI>-srpREFf_>9>40raVFN zQb>uvOkHRVg2?^oT`P$MjoCyX6HJQ!VEN4~SHh|>aW&zXMHV;AcQSHR-(oGU{zfcT3Ts(b7>D75e+73v@1@N>b z3@lj-&mH@gM%JFwr76-X4hXNJ0ZJ&Rq^l85)bKUG8x+BwCD){9w$?*D;We)S%%GWd zF9(bS57ZZDEKBEm88`O0w%+G9be^}mhQ@{%qN>XB!H-OnbOqRU)T`(aEfqqRCCCnS z$C6?T{L2Z<)C8hYzP>_ic83dBHx%99ER9VmZdeXI9giu*uGy`Z94XTrUJ#TKlE@S3 z;abKKqptFjtKbZIN_5O}Rn5_4>FM5~z!BAFsqbxQUYi-KD&`5oYF;-|BRTI66-d0c zvlWdS(ps7L27PTcQ|t;f$@&p`l~S|6z?!Jp!*`b(rpAr5H!R?BE$N7yCgiMEYjU#P zGlTfLzPT0{`U&Zu5DBafd>J&{CLP#4+kjPHCRZ!h(UnezT|7hseN~&BezEn;@GAJQ zakblj(Q_dwc37au=kC)BR1C;4*pEppPS6X0ezC;i^#i9w4Ty&*-ec0DV$TmUT3omG 
zofK#yXy4hiM)CVtWX{U8y6AL)H6127wM8=4hAAQ}4hrC)grsiGi7IyW6-R_FEYre@R{I|-E zevh}-jb|tBA4F^7fNzxSy&k#QEXnm&pO)8BHt?@bY3L!nG9nh^Ailzha^?GkBwVi7VQ{Xc2z=m+@DftU3{bjsulWQDUsdjb zHBDeWy6zrj;11EQ&7N_b^JDiYLsjG6u9(5e^uq;pb_g?ui1leZI$*`TvlCll#ygmq z3&?}~9W6nwR9xGIr1faS{TAS!oblXB{8p9gj6tJjjJC{M+qFFUs!}U~VB8Y`N;N`G zBZa_DTvt#ior#s*#!YtFZMIasZTX^l56a4>Qx;+f9GbU)kUmO$wT?&OY@z|xw;Mbor5pPUpP(0~_?dU(QlN{WyT{LGB3j6a35Dtrh-Q9GZM12- zQU-Trv08F2>+N8-UAlOI>n`Qnv}ji;qf z=S1iOXpCQ~ZRe^WThMacjqDF@l5=07H&55lSEe>YH0Q>}aJz+Tll^;;4Pe1D!OVsiE6Q_T;iz9>_Cm5c|U9$uxLyXRUw z1O4HLaGN+z{bt{0P>Cwn7$&nMjCA9Ydt~-vnC!~ zBgvBQ?OVnOAUfmfIjU}^W~*>a5}M3S}AVSC|PexY3{#t}tKHBR2^lpFdc zLl?c)EEvVoz*kgcJwS@(+3lyLtG6PrO2pGzxnh@V5x5yhnvcMxf7ULW)Lzab+Me0I zY>(i2_Ry7Y+R56QvNlAiQzhYL(6wy^LJ?D$(QS+QNi00&5~ZwO=+~IUvt+(AphU&w zt5TmMzSHUAm#IOZc3Hh>U${e~^g?7TdneSOn!BOV{}E>}tL4KG#)DS-s-`(kvs$v2 zgwMM}xJHU?uS-9kCE+tt0X^mxz>05tBuG^Gk&8VkQKDY>h?3V?{&GX5u~=rtmMxKG zL}Wh80iGA~%GDXhSX*dyqhN;Dq5%uy&Q}>OtL*zdZ@Z;%EQFpFSY~1Kgm}`!Q5e2q zjZVFmuG&OOk_5?#$Ky{AuU78%pI=DB>?;|2{4k--%bmn;eMHw|gnJh<7k%#3ofQDN ztfxpn+Ms}TDP@DMMCUb~g>Hbi2p*s3Q|ah{1@@1Xl%If4AYPVs+dB8`5Nc+Yi&qQ; zNMH<&CgXD3y1mdOZSQ_+BS=9O%Wsh3(L2dMxu%BzPF~~z{oTq4o}@t~<^=0V=U}iKO*mRu5{94JsrpSO3<7y7UnIyl}bnwR=C27QST9YjaK{18bZuW zV@x?s7umK)0@3goJLTU9h6wanAc(YZTs#qD6&okL+5LHgvG64d4#wP0X9S~w#dsSf zKoUX6?HwOYPV<}+q8s1(=P?F0C_!0J+YxR*Pv`2zzwny837LcY?ur1J%*p?-*bVv^ zCh9KdB|ZtJtq((Btt`TCSYvYex*kC8x7}PDVeEl^Hs|IRYiSb81;QH^b2Ti z^)PPu6Rf8NnPv5$ZR0TveA9k(@0afkmU{i-oA2a5j%@{BTUdnxC<656eUR1Mar5Wb2MG4sQQi@fbP1c_GjzPjh6a`$+ngU zZleBimVf`JVNB?c4-kJ{bkGdVkA6PIK5vep>m#`;m^b3FzNOCzj>*7bqa8;hB=S4T zUV-!m*RB~yPyd+sTi5XFIJ!VPSqqqDf_)sbsX=b7U!&jw1aVjax9WWa>=dWF&0Esh zc5jYKd1K&n$6w@OaW|6g4t7xQEtajJW6f+Kv%b?;~9k9}TWmE3?em-?SY+9i2c7KPvpLKYlseO9LTf zvO^h~;HzymC%qt2C3k&{SFkYD(zam-`GXEe&0Sm$u4S~X=5}K$pVeecgEwZoxk~i{KH4gg;&tQZ^u6i ztmLvZ*25iFnLq<7&f09P4I$42xk$e>j1O-$^|oPZ@dRO*KNY1mlrnj0 zbAp;>2LRf#Mur-qg?$GW;NZT|QF9nkr?SP5#sZ4hZR@JkJP3f|Pso8>vuc;i3H{Qu 
zhy))cjWwe*o1BZ}S*>0QpiRSXzW;_~Cu*w3Pjh}J#Roa?)mH;^!06C`B%z;P`cC6X z_2Wb|HxHlyWs}R?!vA)o4{ZV<*eQqp?_$x#@&7}Mf$1~H*}>GZ3IULrj^mp)`M5DC zf4_B|F8J+-8WH?@~nI?q(meR5R$o$Z`~HC_650zL2tbnzR->&x|8w49OX7EikVHu`9jf zm)BRMHAN1*Mmj`zC|H0nld-JK0ZtG5YrYcH z`{r=?)zAIxLRg${lAOPy)7W}Xl0&J3TTHU^yk>$$|IWho6g+~h)P?z^0|wn)in&v!BoapRU?Mp4pa7nYC@}tC7V{D70H=w?TP@~cj;`-)A0AC&<2Af)(@D2LD7D5!SLYmcDXubXP6DbFZYSqBCl0B6SU6F+yK%x|4_A7JIw; z=3RPF*uGnP58062Wh}6<-*>Ox^& zDdV@8PqrO#B;-yippLHHfnV%p~Sas^Klyi1oQ=2OBfgI z-Le&a2K%7J#{+}n&k=f%9RE$DW*HG48B3}s?iOa`R-xm zrs1w=of|4g)+tGchRiXIayF9#C-=5u4S7QA$!4EsMVf(6wm)mu*S{#3X*A+`*>{{f z?$K>}Tdm9p&VZJLyc$^?BNClUda5eRn&XF|MwP3T&~@?2S)BaN%Jallq0RoSw9yG| za1u^_9T~M}T8CBd0k1+^Wyl&q3zBNL)+Cd7NweLBLiCc=g%fVH>&J;r@8jh~`T3{? zLUpL+#i?LzX{d%_zqJoYj2}zu_ag01z=e^=wN~m+)Fb#!cnK~-n527^; zr#Oz~6zHDq4J~)cXv-AhDMGkLtugNFx}xZ|<5+gV#v{aqxZVz~I%4h*x#Lv(eFaUs zF1A*c=T{-A645|33iV%rh?OO`fJPR{t}noCn_~c)=hWCX{8F1;Vr=+ zHdUl>q88Dti>+)y*ia5ir6W>WZRt7!qa|zZowNc8T9QWbI#Xe|u?m_B$_&Npj(4Jn zi66k|zE9B5N0%9_s8HB-(5${4(qm6#$;RaDb)ZPSe^^;)&w>~xyV+4<`? 
zt8d&Y?c>og^1zFlTRJul(J?c5zpGF6olT`@vQu;ec0q2KYtoEmG4)Du$0YWR+;$$XR~kzv788qQO7&n>Fo z>y5EXweT@~y1J2Wn!2uOs%$snCM9^+31?>Fs;6@h%1E=+(-mF|2F--}Kus#KA{ft% z9Tg#cTi4{g5F~rGW~1S8yN#ox3!KkQqo4D`hXrkd7vk|wch5>6I$`}J#@R{S>MGWT zL(%MHZ6W|A0#aBB2MczL?>jh}oYG*V3K~9_R=Mgoy(7ezbcRR~S!1k0vw zm;{iLRfeV;x~vG|-j0Zjo;%uzBoeLbkTvejsk@^pz?_X*C&miWrC9}1UYRW5vQYd| zNrCI*YQd?B?$q~cWb?EF2gz;}m8&qz+NuIRquN|>5JkyCbfFV_pp*w72G81b2tpz1 zFq5up*5>9&%CO5hBOzK*DEd1Y4$gy%8ChyTdyVlVv$2uiE)O^cTRJ}C)8M2dY}7GW zPCT8m0$Bk+kyJETybOs)H+yMRM3>PVc z+Yo84-)oiv&r*?QNB?Q)87dMOo2pSWrgy#A{?I5yrGhGQsbboN(oC}!IEE88+KTkB zR-s*2HAQjj1c_bQHw2>So_GZrqw&4>#IfFKD%kkd`BxpfrUgKC6lgis6tNi zpoB)6wY*h=)>T;bdlDCa)_}b>)W|AEsJu5p40dEr0*yFy$omQQWAIuH(`hnT*0kb6TZD z=&r;oqJ$*@jIcQ%#HipVl*9Q0@6xc;kx3^`@miBnGE_!1&U!0;$jt4^v(`Cts zB{%0t7;H4@7)RF#X0eXO*gKW?pPawLEk_gW0w|TF=k{LY=v8doXiC=v#H zC=^Muj^nFbpwwS%%aby8C?*rGL%-fkIPM8CbN1&FVKryab3GK3>;aY zKrm9u6M}fzvnMCHOUZ>2MpWzBFRv|!G@Mv&yIj48!bzzzC#90ESFYgN%uH1YrMLJs z$tBW5tK1Tn()pyY$^>Fimnc#hI;+bd=;kgamEHEqHcm^Mud<4!Z1_#w*I2Hk;5wze zy=Jnkm~h%p3pPHr$p^dO4bLA_45~PU5fJAHOby7KWp`m{wih#qTQpQz3Ur&f7_^A8&LAEIhSv~H3vgV; z5RsdELoV*3+(L8UWdpcZBnYL4u)rIgWSJ$8h#``iCVwqYi1%w8_6h|#)s{SO-$>P3 z#V%@F`V_xy@``CCXx7C-9eQm%bX|2In=A>9wRB*@BHZ)5QjsKtGjz@wCZ>xmjM&`k z3P(*DQPbsVKm@Q58HZ*Uq-&;Cl%#dUu`Zg-@aorARr?0oQsphnTSjT?(rVG4fO}#D zi{qeTS1yud>44{?rwljwgM^EQkFBxLer@_J4Mv}NS{oiBb8{q5i#8KyD+XHEN(x5F zbP*^MQt3m(Ys1{qa|X_GH|TD}*xH;&^Kta>U|?ZJTf4-c&q{p0wRM?}_+iR%YH;aO zy(desN=ESIqa+qP)-XNI=GgN(;-vuCCYQuxnpSw}yn@aII{$0g@ICsJZ~1*@D!Zh0 zU0AcsYdt~dUxn>7Y{R!uky$0b!i3gCAh}tHmOXiR52O_R2TJ5#uB=`hm?)cNTHg%f z9>kvM)il&IWteBF7i%-3a#{NAC8lR{j?-ztv5Ox=F|b~i!#L+dQa(MknL)&Sh|A?FCDa4fFBL%t)a*&+^$1Hliq(7D?VTLYC=k)8fvvNYA9>&vI;V;K>)vMH_HAQFgg!W3F zz=SuZBbs_2MYmXOx9*ppDbwcZHTWXR&zczqUv_r#JEv3VJ>f@WD%GuZM=*`;fua_f zt}CrQeo+9olNcSA$d5)g;JfzOAYYDmw++m=kVy8-A3ro^)qGgAk7N0@(sz1#NJhLQ zy}4PRtpuJ-H=ZDqwufsnzP*D1yc9qn3y12-U_kV9E6))4 z@T-}*UC(yL1Z+bI1Gt4iY_&NJ_K((%h;wB9?CdPP`@O`~$D6yu$MXfiZIqwME2ER8 
z{cU)1EYjcz$>8~w3jv|ML)SDn4;SUccW#QmxH*YO+N=a_$$m6_*B!xF(mdd!F3(^nnwADv zs8CT4acMlsdMVLgDQnVx!VT3+S>m6C-8LG9g`aSHw0n)H_OvP&HVZ11`)TRHm&SYR znTYj-rAz0SAMPw3P}dZWY$OfUXjA`=LvR@GzD8|W8C~>V%C@XOa2$m`7Dd?KpX^SC zKpZb+pWW=8?`#)YxW3}9>N_aA$+s`Yqg_Dvt`N^Ai`3IG?p>_vpOsR^I9vOR63MV# zRu-oXX{K~aTZmm8#SWtd<#2O1E{aS=io4eIgdEL*lB8+PV>*r6IU=Xo_AAPkK0?+P zFEa8VecR+6Z=$-m#>!gu^NmG(lf8)@ z-!*h!%Q~GZ^8cr;yWKFn#@@L|lOcO6`(;Oj7{!QLg(x4|8BeQGg{8|3E7+o5;NPSk z@7h7hvphMEq}b5n4Iy^4=A>HYl-9ja$fyFoxoKxY$!Q6DB!`XWh;HhhBXTPm$DQMx zSq6K0#DgGd#M0BQD*V}%#+ZF`jh@a4E&2G9e??6;*Fi^A9x>cOZYmUn?*K$T2pl3@ zFuIGcAURXig)7xTEh>;?Q*O8r=h?Zj9jOJEnvGpE@TtqbSM3-aRJs-w2*Ai(Dd@A+ z5i3av!3_qEAGG;Ys?3Re*}}k=!gqkd|D5|DA?NS4r)!;q4BD#IrIHlll2K5z79hrN zUAqz56fVfunMuBz$(@Udf4)b+@MVv$peoVeId_<>MrGu=q~X??k2yD}_G@po;23!e z5;&mDC5VJ7$|83y$Lv0{XL`Q$#Glq>)1|9FR}v)QASm_N7hiN3ajx#+$xULJSTbix z*$huqUN9GW2uVHB9&%^KRwE%Sp3k@pNZ=W2(3xhCJanEVl#Vd?NsL_F9aA%=#)L*g zAj#qC+YEjr>m#`3zSZDIO529f7;iAt08!QpA+AFbqTG~6cByuZe5@7y^ZQ;j_fd8? z%6KNa(o-!b+;)dSxCiVV)4MprV(&64WS;Vv){Pv6brpq9exYgydZMk?^_^0f^_(m# zUyC?2d(gtUwz*@X7^Mu4P4;NXU#kIg|NZ-ug(KBa zW28uu5F&ft+mZQ~S8Kia?Rl-mXKtOdRDJcAQ%&p}cPo6Y1Y?*$jBB44^*uWCvHGu< z+Ox}3uh{LYpEqD=F|ZcXj4VYmCX`o3_d0NoKfDaXdBxHyA-zt=RNeDKy}Ox*^#Wm5 zjz2ytnxYeIe?QLOkBTOezliT0q=w(Wk1;nT+#9+|J{&#cGEY8)XAQ0a#J`9MW`=MsSc_uaA z#^$E4>y3aA#O4QWgN@0E95o zf=_ojLCPhKYZ4Cz_oM_jlm9%>QV$c z85m)+nM6vOQFK#J84OE9m`W2M54JC zVg0r+a100tpHp;MIvaUV!MsTaeOe7{r@CgSU>N`luWU_(MDSIosmnl*3=!FD`V1`3 zS68+WHf%L#B;ZuQmkEfsf7A^eB0!#-MH!@eRLAzNfJbO5Rx3*4ReIn+Hrs%TCA**= zQ=bAMw2%o`4Nl(odANVTFiHeTAcceAl)24=G{E-ISayMkm0*+AShKmU{31P(0d9#Oy!Du!q-u@^$cdKB@i8(@3qwcoX5?Pmid7 z2a{w3^1WIQOl3;>!c01Mz$`xtZw3WqeEU3qOrPAmWp~50bY;2EG7B!!a{dZgSTKtq z0g}&YN%vW49DY)sWeP}Fp+@{cv~=?$o|Wj!GeH0`U#1)iWeVAl%UfLhNw7AqRzyeS z^{B$u6-y8wTuJtfFgN7bk_mBx$xOpODYg2 zc;FmksImv^VMF;+7#7q#0|5tx@KVzrDTI>LXFkuuT3AJ~=up;{*UGKN++IBfH-a*i z#K5>z0gN24>?rB*!FU=kw74)ebyfpQd!lCW71~QKH5$3*!~}PZZgg-fqM+qUe?^aU zMq8bV6k_X$dHK_`pM+MgC+Q9_Ud#}x$fi)7K79LXS2;K=7j%LeFI)vPVJF@@(s54> 
z1b9w?oiQnLP6d60;cKg6Bt z8|zt!%yEvJtVI-Ua4+|4TUQcg@>i$>-n`*N; zolAB{Puls`g88fS=T=&~>{KQYOF_Kzjw1N{>~fOiwSI^yiHrqa_dR=(twQ05l!7@+ z=BC_aXKB4I=XG&c(u@?nL(T{9!)Gj;dfCaS5PUw4)AQ0;C#jP&&9aScVs?z0A6%OD z!Q9q@^<^%8%b*G6hKGM#I4rf@Q~%j%teLg&0ta2Ebh5+yH~GDLY5_ab$O>#aQ^6Dt*aDTlX5u-ffyt2 z1Jyc%LU!Si2BEq{r*dCjUD_hUEupke&?BO>eG^gS($24OwnNUW4P->Bx z|M$|y`0C?ZPzc3dt>|tsyKR`qH=Gt&Jj}ndlLv9Db~Bwr6vSTQE3I|4cRb>GgER^4 zXqgjcu=hNf-#mKem!W3mH=c9-jdlLThWYVqP5M|WH!b36^RCk*(q80WhlEw&l%5*~ zE`=(jFxhNMh|*OawLXezQew>jbSquv0B+@KKlcvN1Cm0~?5dcHiZ2vhb&H??#ejNok66GO;%GU+fncES{D8Qo7t;LIOdo76 zf0&g#-8k)Cm`q8FgnRurMN=`rZwyusC?PEml>?oVnZGA|I0$7&}<6&?HnpDW3V@>RkT| z!TG%}p3c?wGvpZw{lx=Hf>H9GX;Ncl!%+Ur_?)z7$2Yc&D})B+I4&z>t+hoy9MQBA zXzF@X`Q#RHyKqMI!$5xHb49Tb*)%2oir1W=DMMzou$Eb?@HccBVdQeJTtD#X^AFrl zgYVxN{RXFkiJK@+C+{wBiIzl|O_8HotSM1 zVi>oQwOwh$R)p=_lC#)YhuVUQFfg?8)QTYpf|^t&1#5(zs9>#aqfa$0P`CLn)aD4$LzE z1}&8h#D;TfydjO2pl!f&Cyz^wf(GF&w?@jwhQcUDgB)tDF_yIZ0767dz<<-$0s$SS z&v6>sf^I?IXwHnoq~s8GKr+w#(%3naq!=8bs@WE`>_lQF#WNTm+ zP!=MPilXwHvW1_m4V8IxyMM{=-``v9iM(k}a_ZE>TE)0{K;3z~?lgM#e1pAehx`b) zT|W0$*obB`7&wxGS@@L<{V-4%({~t3an$6pU#s)zvzzfZLSYb%(5b_LN461tg zSwVT%&|;xp-%hQw(5i3OUe^ZgIpR_RP-SEa!lh|$u@&{V(P{bo=kJ=MqD9~h;BCP~49JCa#F8%AX`)pJ3Uu&E`V zRA(|p9@hegdZz*AYgQKmBaX-0&BM(0K=g6Z(gKjC;a#ad1z0>t`*{^gxf~6MQXvYYes!|g!%MA=drs>0F()o=4E`f^D91Eij zCp)dH)t-Zu7GxdR*&@3oG~jWgYJt1t! 
zpkS(3zLBKU4B1|E7GA28{k9juF4fgyZJw?v?b+3)P(SCDbo&2<=_eYw zBgR3Ul}fwq(}_AuaSSwfDjJ>|fziLt(2`5@kpVgc0d$4EJ@LGe?Wd}NOjmV)VmK+L zr>kU&fe7zbE9+C&h26gk0=Y;!euLBP@9v%|OxgbLW@YN2Q)mrBqqx!|J!BY1j6ybH z5cW!he4`W#N=ToCKK~9?_#Z3id+%c#L=HD9s}iCI`8RPhA)}knLM#md!w9Yp(+;0mwxRmUFISfh=n& zN0JrCnuLVzSzc@Xz(Dj$ZXnfshYbVdBNQ<3oZvf>zI$WX=K=>c16N6dpb5yHl+7PH zc^B)wcZ}ccI|qL(;~Cd+M`BPUoX zSvIuRbaf|!*uNvTRb*o{DmMkKnCt$xI{Q(6QS`SM#}?uXj)mmQ^<#K=e)k!!C&{z= z`ul}kUo(AO#F6U7ngV6R!OM7+T&`c?ukdPJKpfb*asMu9eIw2iJQY-CZbSZOYrp0cz1cf<+4nXe- zU;xmwe!18}b!}kj4dw6|v?~c02nJXrKeA3ek#0em6M&d2_a|GcV_x29FT_FE$(4s@ zSFC2SpbAnpRUT#)%rd(C8INFcW2}Yt~aMjy(PJ|RC~R0%HE!|B|ThwkiPR*8jzEbbI{6>bHvhv)K7% zOto9RUUzoZWBYR;A3q?H6W^PJ9&DflN^HDI991wm7f_W{ztr;Aoj7)QQum^ALX;|{ zS_x69wll5tJTTI?ePkk&A8HSSIa;i{i2qYnqL;*2s&%RaW6pIYMpe^4aQAw$d)n?L z^@JJ2wjw7sq@Pd+E@rKL&_FI{s&Yr$q9>bAqWWo3tRDym%`f57gIo^?-_>J5FNY}! zLZGYurQ{SC=O^M& zY3JiPB&({9wp*EsHUUUlz5ZS}YK`)Nn>b$5);P?Mk&3lsIkvv5x^j;+E2cMF8e(0u z+(%#{^s%N7KaH0w#Q7RASf`xI?=4EC@NrL9F{jH6dwGu5w~^8b`9$Jc)O6w9J30&I zMblGiZkG}^!Y`~7j4V(rYnoOJmUXTDjeM*z%%X5cF3a))|E$E*R8O9>fL}k}5qGVr zd}m?+2WbYH-jD?x(;=PNW>zvrXPQ8uMt$1`3opnTEllM5mGa5aI&h<+u35HDV+A2z z?-RpO2FbEgDC}AjgGc#&VFSl9ZrNjVhxY5nT3D>6CIC+Nc2G)l*$H z18$i%Q$ouB!3af;6s`TdBe0RcnBr2U>vq{H?iDKU*5!)pxvm29#$VVWLnZDA3VTZyD#BlEfEc+o|#G-|h2vN_AQ-wCYgbyeL?)SZs?m#SONBuARoISYYy-gBi;^=l%ID85 ze@VEN!f^y5ffZ3!n@lHIa$5U&Az7o4;!RG3v)%x1)7^Hawear zXdDYxsI`LzjvMXX!?!9G3(C|)jxT-v`UviL2$BB%8n6lPbgk!(Yc@(7` z=J;Q)qpW*bJ7ImjGB@lomv2?g=6iKXq(r;^s66?t-(r8Y*$k1eK)t>y&as5dfJmCY z474q+{`LkIQLPTHB@1Yz3p@@-g2?1POSHv5xPHZX@Y?G;u2dpfHeAPXtMRp4vOF#B z5V|K+oIq%H=Z0n0Y>FUIOv%pC52BY$B2$ToN{Q7<;gSkZXPyCA6sPM1bmbfxy<$zn zLsufis314vEYa#2wbZ!RJ%?Bp;W})R8-31{9L&yBG7Py>=*ptLvRD(v4HTz``@O}R z4_Dsll#uqYi@1Ynmg5vr|F?$F9egQx>iqpM>FWDXS|8JPSQCGpH{?#Lq)beV92X9* ztL%d^qYF<103_;76(6ekA~U9Rovd=$qtF{&G=gnJDyb!G_oaZ3qS>_c;uLU6O>L@1 z)fHGwltAHpel)N9$jJc-C<)UvGlAzcQ$xR|gD4PJMt0>P0Y-FOvPt{Ie#Om{aMLvM z+F=7&NvZ2813W=vt+pg8~j1S;TS(No)psm3N;_f$@Zd%*(mzZSrl~jsz6k_ 
z)22AIs#@Khx)1~PLj8$~AI+6}ksG^;j*xXzRdII}){(9v22d4M2j~d_K4-nHJI10p zrhIg4)efn7=f$?h{HNMK2v^NU2~{at*Pq~{b*c5!g^W3y<)(){(1b7VpFSDZ)w=Bxbs5r0cj-jnQwTd~B zXHBhX>Z2Mg$X*Nit8U@^t0HwWf#inVyhuS>Bts?$rEq4%h*X}VlltSXHb4}q9Oe9L zQ-JIQn)a9?7pXhdo=jL+c|Y|r4ZkWxF_t3pg=`7y?@_j{6A(>kkF!3T7ZFP(VqqT_ z4axW}?e7GFHI|j{dsL~5 z>1Z0#H<6VpeF>SWFlYBqcrX666{7(G94*P7e!hDn9#}Z|ncv?$E^U~>di?AC5ItTB zu%(=lgr&znUksA#CA<8hQ#|An%E2PW<8Zo3qCsl?=?)_69fOT!S45esn?7+U?&nT6 zag+%jTy&u1%G*`5I>R+)O2czh2<*l=&sn_4nowR+C z%}lxKJ}luNKZe_nk+M>7#wkaks8nU33@a#2(qWP4E{X^PDm-vxO3IFcAJH-b8W~A8 z4wXYA)j-1+4vD?Oz%m{ zp3tC)ysJrljryN=aVg;|^cx&GY1-C>xFM2rW;GhK_qMseL!`%;PL`7LF95J}hJ!~4 z=Wve>KDTyrm5ucAfZw{&r)S`}ZuUzv&9imJH40h72DY-E%}_YZulK~&_p9$TfSD&Y zb>rq$8}&KvX6xdYDKo=cDky>6BAo4avPm!Qd?)-nkNAiyT&DK?*!sMX_w3dvtzTZ- z`TsbjH-q<6ZrJe8@|1u8YWxQ%AO*ew1~@Pb4q!O|-~y=!Ie38SV1fYn88?K$7Ed4o z^20VRhNM9jVMu{e&h%2CJ07tAc;XKcP2jFaoeDatfd8sw!6+i)XYS0WMrUe2R z(us3$WRMv71-t?QdkaEnvxpX`lYt1C#lv~vjU>cKE90vR-u`Nk_;%g!D#VQzAcdj& z@EYVb0vwk59Ir#r_#6bZm0rF6b|&%&rrx20oRf5=5(W3K@bmbjTnOUc8GazaJ`D$UL9$j-vKSEVw@!5= z51ntX$180{la>7!*Ak7-51&s@zl+LFejkg+JNI~4`90&w#mi@tXumQ>JyoC9;*_wT z)0tcxm7;2P_j=)jmo$5%L;r3j)?$RZqeFMgFfD3S9^2?E6lycw2QvdVr`KB?E>E@d z`0-EDzkl_g$K&)EFn6$Icn;Ri?V%%k{H6lREu!O!dukA$Mr};ysGikfveSGoy{r5- zQLwvC46T+v9R1)}n7M}z=ERz7JgDK1L8m5-y>FIznWaCfHy#YyxNBqF!$$o8yekyU z`4a57%U0)S`(&^-_jrCz|Yscv_;cULV zne%rz1dk998H9+)B62XDJfeUoB1(ud2vdP0kq}kYNH(B_1(y8Au1Z}EtkhJCmnfvB z%}hGR?@wQSE@q%1|74`GWKA^18XLZ6)?5oMwUUCZ*4kj_m*4Xv8&@Q-+(Adyd*Y-s z4!Y=yqi(wE!2&0o_0-Fk`o9};;i_x-=<5poTuEB~+$cb*fd&~Y%@9Kkqt#J^TzSI{ zcTwpw@Q}$9KWi~uHl9YvF;WaKZWhHY;<(%{+8BBAja6Wr@g|sPl0uf_ZL%q*D#8a} zUX{kl#HO2JCM(S1S=ma=G1olvEwIoci!E`LrIuN4g_TxWZH=p~b&YkdmEfO*An6u- zi_GY_EV81PjC)BWvULkavlKv%2|q_jd{You6e_oU4s^uJDe`dO%!$NpirkyeW%I7 z!tU|-0x#^&qLLUkU5|xk#F|>#I?Y&`<|_2C>FeHi6L9Z?Ay7JOmSn_^X0W*Qq10mm zaf3{uB3+O65@&b2YJACW(34;~<+X%Y8TvZQi2wM;lC##m49U{}v6RMKORABtn!JeDDho={jM209-8lAypu{m4?9$z2~*0-XPvWlvjx`r9C zrk1vjSzU8_uIQVOf{ad@y$Jrv)IXeGrrb|>l$>33)iu}Ma1#IoMc@-7sD;BPwuzew 
zQ3MKuBakRG2Aj-jVAPm#2lYGUG#Plm_&n+{B8g0)(&!8(i_PKk_yVCwERo9O3Z+V| z(dzUDqseTs+MJwST;1H=>gSFbp*`xY)7<&2=T{$<1c!6Z*=YHSRn-R;>mYFIOp@&n z3WFn%S*wGJm2bX6_ERSGeyz9-Mu*3>$jM!j{UF=~W2ROtLj&XuhoZv$46gy|bI;MOoEN+x0^n(*yt^7(p?d zASv~3Da-Lf3`(*RJ2k!d4e7X^f8H>PlQhc8yjm5GV|;?ahT#yhW(h2G3f*sR}%+gqX#Sh&baH_Svw0U&Rad3o=ORS@}~hD5?bt^njLabLW8O6 zCb)^?UML9i5Zk-?xQdiFH_*MjxsLa)T#v;nenH>q`$l!F-y_=lxgHY0(^EbI-1Xgl zaMKwgL+5G}Ue>~QfpFgJ+t|PQvmg~1@DSY{&hVdRD+wy>mN*sd9KEZRY&LtH=)0>{ zFXK@t3IdB7#J#bu^6r$5TrtP?R1?FWu1eH;<)njU8ST=QTVHOA zasAyq#m|Xe0oB8LD2H0@!d=4_(6N3C*)RV0H9PoCRl=*+VO$b)?9C;zav9L;%-pXR z{WDG6h_hMx^3?IpRD;Z@Z;UQTpa3|=6W2)%p2Jn zrpF${f^_*==XE+*&ajNl4}LwH^-la1w9INq7{yJTCutTQbc3LoOeE-w&9E90MlboE z(ZfQP8KUu5l4jxA{C0WI5qI>X8|p4}#RPfa@8|VNEqy?5ga#_+S|j-3BKw?+PguQ^ z-HS}J_o9+I?VZ6iij(}>RKTLVX2&N0F{ZJ5nd<4ukJ(Ez-dfT4DnB(1v;5u+HBBEn zbR6enJI1j-Z@iqzuFIvo=v8k<(E#n@0r{|N3I;un+~ednO%jZpM{x-syZ$Ielw*f` z#1m=koo7@x5l^wPu0cHb@z3CwfD+E{QSEyL*8vP%`C%F+LWBqrB9uvl2oWNbL4<~h z5FsQ6fgLJxmk-j1Wu3J=<<7Wj+K0TBhLBCs_!+vA`0lunjjWc6lzsMvZJ1VP@QX!c zLB-^}R7QmvtH(u>)jPYb)asF7=}1ku#TmzuXUj7#om=UfCKcI~65WT_u?X=jrED5S zlmz_i<#X&@5E2L(11TZe5|}1{(81&&lrn~DNmoN4pb1-*Ac0WW)+7m`7*2$((5KVF zX2_yMV~h;|00z(tds;9WELlxXCMnI^R8^`C2*Mod+>EG`h&Ve3{)?OcSb+;FLjoaV zG$o2(5|}21(81&&lrn~DNmoN4pb1-5Ac0WWw%PtUp%_ktt#L}QE%6h#Ns z3wv6Kpcp4>;v|I}Y7j#UAnfvSD}iB=aFqQ5+Ij2q8e8Mjcx_40VbKQi64xy9=`n)f z^iEmpZg;~8l1ig9rt>bDH5q^qjG!1EYY;ozhJtE1`2@~BPF+KYMgkhp5NFDpcqbg`dcd8 z|9}5TV|th4p1Db^&#nKxO+A&Cj#^0*df>`K-s& zVIlj>!hNN(%Sc_%t1`o0c2udO`FG)5Kq`(nny-$hkt5W9=fvXGD)-}0Zu<$=TzpA$$XXu|K(tM|AJch>!>We|u` z<(?DQStBmOlY#KAF6220;}0opm;$2T1?5sg3{{mMrv@JqWb!6SY7$7=tVovO)alBJ z>cI4@qL^(*> z!6woT1UX^Uh{=g3<$zBT%XuGjcB$yHv%^vv)H*De&naeYmanX^QMRu5b< z%h7G~{oFSq+a2y^a++D~Q*OU~SPfvgYh#HIGAz&5+8WuRW2#RO+o2SB;oIsocn)kp zdOn@**Rz4w()aN)-}%eg*yVM?A1w3C>R^^FMoZlchgP~DG)*y0HLa!wAp8SzcJA%# zVHKTi%d@F|yinez+LP;FVC7PUCFV4C&Xrkuuw1X!5}2)5u#2vjvp{*49w5=Nw&mhL zTLw|A`sdB+UMi)qH0e!aBe%KHs5j+iob!=-7*3E>8lA!Xo3)X1UWYbTH`gHjgN{4z 
z3%H1{N3V7`w^~LERxX=nwLP{4bur|u0mH^@tR*1=kPIrQ4j3?Pe;m$mgVc~$m~~rK z5MG`@%LK!zijPH55o;t{)tepIkzg4n4^1X%sw9QH?lO2ldCU`Y8%r1(sE*nYmnRsq zI!KCjFf|bK3jjmmr#dv*yi&Ihs3Y`{s9Pkc7=lSt%wMyq$K{?yV#V+p2dy5IRSeIPK#sF32&be=U5>-w(QfYSz}N code { background-color: rgb(21, 22, 29); padding: 65px 30px 35px 40px; - border-radius: 3px; + border-radius: 0; font-size: 15px; - /*border-radius: 3px;*/ + /*border-radius: 0;*/ /*border-top: 1px solid #dce0e6;*/ /*background-color: rgba(0,0,0,.87);*/ /*padding: 15px 20px;*/ @@ -485,7 +501,7 @@ h4.doc-heading { @media screen and (min-width: 60em) { [data-md-color-primary=white] .md-search__form { background-color: transparent; - border-radius: 3px; + border-radius: 0; border: 0.5px solid rgba(0, 0, 0, 0.87); height: 1.84rem; } @@ -678,8 +694,8 @@ code .md-code__nav:hover .md-code__button { } body { - --md-text-font-family: metro-web, Metro, -apple-system, "system-ui", "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif; - --md-code-font-family: Fira Mono, ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace; + --md-text-font-family: Geist, -apple-system, "system-ui", "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif; + --md-code-font-family: 'Geist Mono', ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace; } .md-content { @@ -1033,7 +1049,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { .md-typeset .highlight :is(.nd,.ni,.nl,.nt), .md-typeset .highlight :is(.k,.kd,.kn,.kp,.kr,.kt), .md-typeset .highlight :is(.nc,.ne,.nf,.nn) { - font-weight: 100; + font-weight: 400; } .md-typeset .tabbed-labels>label > code { @@ -1052,7 +1068,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { @media screen and (min-width: 76.1875em) { .md-typeset .tabbed-block > .highlight:first-child > pre > code, .md-typeset .tabbed-block > pre:first-child > code { - border-radius: 3px; + border-radius: 0; } } @@ 
-1072,7 +1088,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { background: none; z-index: 1; padding: 5px; - border-radius: 3px; + border-radius: 0; border: 1px dotted black; bottom: -0.7px; top: -0.7px; @@ -1109,7 +1125,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { height: 100%; background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.025), rgb(0 114 255 / 0.25%), rgba(0, 42, 255, 0.0125)); z-index: 1; - border-radius: 3px; + border-radius: 0; border: 0.5px solid rgba(0,0,0, 0.5); overflow: unset; } @@ -1150,7 +1166,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { display: block; margin: 0; padding: 1rem 1.4rem; - border-radius: 3px; + border-radius: 0; border: rgba(0,0,0,0.6) 0.5px solid; } @@ -1575,7 +1591,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { font-weight: 500 !important; padding: 0.4em 1.5em; font-size: 17px; - border-radius: 3px; + border-radius: 0; white-space: nowrap; } @@ -1623,7 +1639,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { border: none; color: var(--md-default-fg-color); padding: 8px 25px; - border-radius: 3px; + border-radius: 0; background: -webkit-linear-gradient(45deg, rgba(0, 42, 255, 0.1), rgb(0 114 255 / 1%), rgba(0, 42, 255, 0.05)); } diff --git a/docs/assets/stylesheets/landing.css b/docs/assets/stylesheets/landing.css index f364a73580..1ebb351b90 100644 --- a/docs/assets/stylesheets/landing.css +++ b/docs/assets/stylesheets/landing.css @@ -17,6 +17,98 @@ /*letter-spacing: -3px;*/ } +.tx-landing__hero_text h1.glitch { + position: relative; + animation: glitch-skew 10s infinite linear; +} + +.tx-landing__hero_text h1.glitch::before, +.tx-landing__hero_text h1.glitch::after { + content: attr(data-text); + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; + background: white; + overflow: hidden; + clip-path: inset(0 0 100% 0); +} + +.tx-landing__hero_text h1.glitch::before { + text-shadow: -3px 0 rgba(0, 72, 255, 0.8); + animation: glitch-1 10s 
infinite linear; +} + +.tx-landing__hero_text h1.glitch::after { + text-shadow: 3px 0 rgba(206, 0, 255, 0.8); + animation: glitch-2 10s infinite linear; +} + +@keyframes glitch-1 { + 0%, 9%, 15%, 54%, 60%, 100% { + clip-path: inset(0 0 100% 0); + transform: translate(0); + } + 10% { clip-path: inset(5% 0 88% 0); transform: translate(6px, 0); } + 10.5% { clip-path: inset(8% 0 85% 0); transform: translate(-8px, 1px); } + 11% { clip-path: inset(0% 0 100% 0); transform: translate(0); } + 11.5% { clip-path: inset(12% 0 80% 0); transform: translate(9px, -1px); } + 12% { clip-path: inset(3% 0 90% 0); transform: translate(-6px, 0); } + 12.5% { clip-path: inset(0% 0 100% 0); transform: translate(0); } + 13% { clip-path: inset(7% 0 86% 0); transform: translate(7px, 1px); } + 13.5% { clip-path: inset(10% 0 83% 0); transform: translate(-9px, 0); } + 14% { clip-path: inset(4% 0 89% 0); transform: translate(5px, -1px); } + + 55% { clip-path: inset(58% 0 34% 0); transform: translate(-7px, 0); } + 55.5% { clip-path: inset(62% 0 30% 0); transform: translate(9px, 1px); } + 56% { clip-path: inset(0% 0 100% 0); transform: translate(0); } + 56.5% { clip-path: inset(55% 0 38% 0); transform: translate(-8px, -1px); } + 57% { clip-path: inset(60% 0 32% 0); transform: translate(6px, 0); } + 57.5% { clip-path: inset(0% 0 100% 0); transform: translate(0); } + 58% { clip-path: inset(63% 0 28% 0); transform: translate(-7px, 1px); } + 58.5% { clip-path: inset(56% 0 36% 0); transform: translate(8px, 0); } + 59% { clip-path: inset(61% 0 31% 0); transform: translate(-6px, -1px); } +} + +@keyframes glitch-2 { + 0%, 10.5%, 16.5%, 55.5%, 61.5%, 100% { + clip-path: inset(0 0 100% 0); + transform: translate(0); + } + 11% { clip-path: inset(30% 0 62% 0); transform: translate(-7px, 1px); } + 11.5% { clip-path: inset(35% 0 57% 0); transform: translate(9px, 0); } + 12% { clip-path: inset(0% 0 100% 0); transform: translate(0); } + 12.5% { clip-path: inset(33% 0 59% 0); transform: translate(8px, 0); } + 13% { 
clip-path: inset(38% 0 54% 0); transform: translate(-9px, 1px); } + 13.5% { clip-path: inset(0% 0 100% 0); transform: translate(0); } + 14% { clip-path: inset(31% 0 61% 0); transform: translate(6px, -1px); } + 14.5% { clip-path: inset(36% 0 56% 0); transform: translate(-8px, 0); } + 15% { clip-path: inset(34% 0 58% 0); transform: translate(7px, 1px); } + 15.5% { clip-path: inset(32% 0 60% 0); transform: translate(-6px, 0); } + + 56% { clip-path: inset(78% 0 14% 0); transform: translate(7px, 0); } + 56.5% { clip-path: inset(82% 0 10% 0); transform: translate(-9px, -1px); } + 57% { clip-path: inset(0% 0 100% 0); transform: translate(0); } + 57.5% { clip-path: inset(75% 0 17% 0); transform: translate(8px, 1px); } + 58% { clip-path: inset(80% 0 12% 0); transform: translate(-6px, 0); } + 58.5% { clip-path: inset(0% 0 100% 0); transform: translate(0); } + 59% { clip-path: inset(83% 0 9% 0); transform: translate(-8px, 0); } + 59.5% { clip-path: inset(77% 0 15% 0); transform: translate(7px, -1px); } + 60% { clip-path: inset(81% 0 11% 0); transform: translate(-6px, 1px); } + 60.5% { clip-path: inset(79% 0 13% 0); transform: translate(8px, 0); } +} + +@keyframes glitch-skew { + 0%, 9%, 16%, 54%, 62%, 100% { + transform: skew(0deg) scaleX(1); + } + 11% { transform: skew(0.5deg) scaleX(1.001); } + 13% { transform: skew(-0.4deg) scaleX(0.999); } + 56% { transform: skew(-0.5deg) scaleX(1.001); } + 58% { transform: skew(0.4deg) scaleX(0.999); } +} + .tx-landing__hero_text h1 strong { font-weight: 800; } @@ -31,7 +123,7 @@ .tx-landing .highlight { display: inline-block; - border-radius: 3px; + border-radius: 0; padding: 1px 5px; border: 0.5px solid rgba(0, 0, 0, 0.33); } @@ -111,7 +203,7 @@ } .tx-landing__hero_text h1 { - font-size: 3.2rem; + font-size: 2.9rem; max-width: 36rem; line-height: 1.1; } @@ -245,7 +337,7 @@ font-size: 20px; font-weight: 400 !important; text-align: center; - border-radius: 3px; + border-radius: 0; border-color: transparent; margin-right: 5px; } @@ -297,7 
+389,7 @@ [data-md-color-primary=white] .md-header__buttons .md-button--primary, [data-md-color-primary=white].md-header__buttons .md-button--primary:hover, [data-md-color-primary=white] .md-typeset .md-button--primary, [data-md-color-primary=white] .md-typeset .md-button--primary:hover { background: rgba(0, 0, 0, 0.87); - border-radius: 3px; + border-radius: 0; font-weight: 400 !important; /*margin-right: 10px;*/ } @@ -354,7 +446,7 @@ background: transparent; color: rgba(0, 0, 0, 0.87); border: 0.5px solid rgba(0, 0, 0, 0.87); - border-radius: 3px; + border-radius: 0; } .md-header__buttons .md-button-secondary:hover, @@ -383,7 +475,7 @@ } .tx-landing__highlights_text h2 { - font-size: 2.3em; + font-size: 2em; max-width: 600px; font-weight: 700; margin-top: 0; @@ -424,7 +516,7 @@ width: 50%; &.enterprise { - border-radius: 3px; + border-radius: 0; border-top-right-radius: 0; border-bottom-right-radius: 0 } @@ -516,7 +608,7 @@ .tx-landing__highlights_grid .feature-cell { padding: 30px 40px; - border-radius: 3px; + border-radius: 0; border-color: rgba(0, 0, 0, 0.75); border-width: 0.5px; border-style: dotted; @@ -770,7 +862,7 @@ } .tx-landing__major_feature h2 { - font-size: 2em; + font-size: 1.7em; max-width: 500px; margin-top: 0; margin-bottom: 1.5em; @@ -1022,7 +1114,7 @@ .md-header__buttons .md-button--primary.sky, .md-header__buttons .md-button--primary.sky:hover, .md-typeset .md-button--primary.sky, .md-typeset .md-button--primary.sky:hover { background: -webkit-linear-gradient(45deg, #002aff, #002aff, #e165fe); - border-radius: 3px; + border-radius: 0; border: 1px solid transparent; } @@ -1126,7 +1218,7 @@ .tx-landing__quotes_grid .cell { padding: 23px 23px 13px; - border-radius: 3px; + border-radius: 0; border-color: rgba(0, 0, 0, 0.75); border-width: 0.5px; border-style: solid; @@ -1137,27 +1229,10 @@ } } -.tx-landing h1, -.tx-landing h2, -.tx-landing h3, -.tx-landing h4, -.tx-landing h5, -.tx-landing h6 { - font-family: 'Geist Pixel Circle', 
var(--md-text-font-family); -} - -.tx-faq__item-title { +.tx-landing__hero_text h1 { font-family: 'Geist Pixel Square', var(--md-text-font-family); } .md-header__title { font-family: 'Geist Pixel Square', var(--md-text-font-family); } - -.md-header__buttons .md-button { - font-family: 'Geist Pixel Square', var(--md-text-font-family); -} - -.tx-landing .md-button { - font-family: 'Geist Pixel Square', var(--md-text-font-family); -} diff --git a/docs/assets/stylesheets/termynal.css b/docs/assets/stylesheets/termynal.css index 436db2b65b..d1bd26039a 100644 --- a/docs/assets/stylesheets/termynal.css +++ b/docs/assets/stylesheets/termynal.css @@ -34,7 +34,7 @@ font-size: 14px; /* font-family: 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; */ font-family: var(--md-code-font-family) !important; - border-radius: 3px; + border-radius: 0; padding: 45px 25px 25px; /*padding: 75px 45px 35px;*/ position: relative; diff --git a/docs/overrides/home.html b/docs/overrides/home.html index 36a9326230..017453a67d 100644 --- a/docs/overrides/home.html +++ b/docs/overrides/home.html @@ -50,12 +50,12 @@

    - Reduce GPU costs by 3–7× and eliminate vendor lock-in. + Run development, distributed training, and high-throughput inference.
    @@ -180,13 +180,13 @@

    Native integration with GPU clouds

    - + Backends - +

    @@ -207,22 +207,22 @@

    Easy to use with on-prem clusters

    - + SSH fleets - + - + Kubernetes - +

    @@ -255,9 +255,9 @@

    Dev environments

    + class="md-button md-button-secondary small"> Dev environments - +

    @@ -280,9 +280,9 @@

    Single-node & distributed tasks

    + class="md-button md-button-secondary small"> Tasks - +

    @@ -316,9 +316,9 @@

    Scalable model inference

    + class="md-button md-button-secondary small"> Services - + + + {% endblock %} From e16568443315b31053bd3be32629b7a0c3cba402 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 19 Feb 2026 06:49:46 +0000 Subject: [PATCH 149/187] Kubernetes: gateway: start services via docker-systemctl-replacement (#3584) - ensures all enabled services are running - acts as a proper init (PID 1) - reaps zombies, handles signals, etc. --- .../_internal/core/backends/kubernetes/compute.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/dstack/_internal/core/backends/kubernetes/compute.py b/src/dstack/_internal/core/backends/kubernetes/compute.py index 870b6bb657..21ac9228c9 100644 --- a/src/dstack/_internal/core/backends/kubernetes/compute.py +++ b/src/dstack/_internal/core/backends/kubernetes/compute.py @@ -894,11 +894,14 @@ def _get_gateway_commands( # regenerate host keys "rm -rf /etc/ssh/ssh_host_*", "ssh-keygen -A > /dev/null", - # start sshd - "/usr/sbin/sshd -p 22 -o PermitUserEnvironment=yes", - # run gateway + # install gateway f"su ubuntu -c {quoted_gateway_commands}", - "sleep infinity", + # start docker-systemctl-replacement as an init replacement (PID 1), which + # - starts and supervises enabled services (sshd, nginx, dstack.gateway) + # - stops running services on SIGTERM (graceful shutdown) + # - reaps orphan processes + # See: https://github.com/gdraheim/docker-systemctl-replacement/blob/b18d67e521f0d1cf1d705dbb8e0416bef23e377c/INIT-DAEMON.md + "exec systemctl default", ] return commands From a46600f9680715845b0d094d988fb021a871054e Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 19 Feb 2026 08:44:28 +0000 Subject: [PATCH 150/187] Remove dangling services from gateway (#3586) If a new service fails to be registered in the gateway because of a dangling service with the same name, automatically unregister the dangling service and retry new service registration. 
--- src/dstack/_internal/proxy/gateway/const.py | 1 + .../proxy/gateway/services/registry.py | 3 +- .../server/services/services/__init__.py | 25 ++++++++- .../_internal/server/routers/test_runs.py | 56 ++++++++++++++++++- 4 files changed, 81 insertions(+), 4 deletions(-) diff --git a/src/dstack/_internal/proxy/gateway/const.py b/src/dstack/_internal/proxy/gateway/const.py index cb172f73f2..7b958030a3 100644 --- a/src/dstack/_internal/proxy/gateway/const.py +++ b/src/dstack/_internal/proxy/gateway/const.py @@ -5,3 +5,4 @@ DSTACK_DIR_ON_GATEWAY = Path("/home/ubuntu/dstack") SERVER_CONNECTIONS_DIR_ON_GATEWAY = DSTACK_DIR_ON_GATEWAY / "server-connections" PROXY_PORT_ON_GATEWAY = 8000 +SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE = "Service {ref} is already registered" diff --git a/src/dstack/_internal/proxy/gateway/services/registry.py b/src/dstack/_internal/proxy/gateway/services/registry.py index dc6407d245..adebe6f41d 100644 --- a/src/dstack/_internal/proxy/gateway/services/registry.py +++ b/src/dstack/_internal/proxy/gateway/services/registry.py @@ -8,6 +8,7 @@ from dstack._internal.core.models.instances import SSHConnectionParams from dstack._internal.core.models.routers import AnyServiceRouterConfig, RouterType from dstack._internal.proxy.gateway import models as gateway_models +from dstack._internal.proxy.gateway.const import SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo from dstack._internal.proxy.gateway.services.nginx import ( LimitReqConfig, @@ -63,7 +64,7 @@ async def register_service( async with lock: if await repo.get_service(project_name, run_name) is not None: - raise ProxyError(f"Service {service.fmt()} is already registered") + raise ProxyError(SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE.format(ref=service.fmt())) old_project = await repo.get_project(project_name) new_project = models.Project(name=project_name, ssh_private_key=ssh_private_key) diff --git 
a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index b701b822b0..8dba43ea85 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -5,6 +5,7 @@ import json import uuid from datetime import datetime +from functools import partial from typing import Optional import httpx @@ -33,6 +34,7 @@ ) from dstack._internal.core.models.runs import JobSpec, Run, RunSpec, ServiceModelSpec, ServiceSpec from dstack._internal.core.models.services import OpenAIChatModel +from dstack._internal.proxy.gateway.const import SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE from dstack._internal.server import settings from dstack._internal.server.models import GatewayModel, JobModel, ProjectModel, RunModel from dstack._internal.server.services import events @@ -177,7 +179,8 @@ async def _register_service_in_gateway( try: logger.debug("%s: registering service as %s", fmt(run_model), service_spec.url) async with conn.client() as client: - await client.register_service( + do_register = partial( + client.register_service, project=run_model.project.name, run_name=run_model.run_name, domain=domain, @@ -190,6 +193,26 @@ async def _register_service_in_gateway( ssh_private_key=run_model.project.ssh_private_key, router=router, ) + try: + await do_register() + except GatewayError as e: + if e.msg == SERVICE_ALREADY_REGISTERED_ERROR_TEMPLATE.format( + ref=f"{run_model.project.name}/{run_model.run_name}" + ): + # Happens if there was a communication issue with the gateway when last unregistering + logger.warning( + "Service %s/%s is dangling on gateway %s, unregistering and re-registering", + run_model.project.name, + run_model.run_name, + gateway.name, + ) + await client.unregister_service( + project=run_model.project.name, + run_name=run_model.run_name, + ) + await do_register() + else: + raise except SSHError: raise ServerClientError("Gateway tunnel is 
not working") except httpx.RequestError as e: diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 25cbbead36..1f6b1ebf3e 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -13,6 +13,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal import settings +from dstack._internal.core.errors import GatewayError from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import ApplyAction from dstack._internal.core.models.configurations import ( @@ -2299,13 +2300,13 @@ async def test_returns_400_if_runs_active( @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) class TestSubmitService: @pytest.fixture(autouse=True) - def mock_gateway_connections(self) -> Generator[None, None, None]: + def mock_gateway_connection(self) -> Generator[AsyncMock, None, None]: with patch( "dstack._internal.server.services.gateways.gateway_connections_pool.get_or_add" ) as get_conn_mock: get_conn_mock.return_value.client = Mock() get_conn_mock.return_value.client.return_value = AsyncMock() - yield + yield get_conn_mock @pytest.mark.asyncio @pytest.mark.parametrize( @@ -2481,3 +2482,54 @@ async def test_return_error_if_specified_gateway_is_true_and_no_gateway_exists( } ] } + + @pytest.mark.asyncio + async def test_unregister_dangling_service( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + mock_gateway_connection: AsyncMock, + ) -> None: + user = await create_user(session=session, global_role=GlobalRole.USER) + project = await create_project(session=session, owner=user, name="test-project") + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo(session=session, project_id=project.id) + backend = await create_backend(session=session, project_id=project.id) + gateway_compute = 
await create_gateway_compute(session=session, backend_id=backend.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + gateway_compute_id=gateway_compute.id, + status=GatewayStatus.RUNNING, + wildcard_domain="example.com", + ) + project.default_gateway_id = gateway.id + await session.commit() + + client_mock = ( + mock_gateway_connection.return_value.client.return_value.__aenter__.return_value + ) + client_mock.register_service.side_effect = [ + GatewayError("Service test-project/test-service is already registered"), + None, # Second call succeeds + ] + + response = await client.post( + "/api/project/test-project/runs/submit", + headers=get_auth_headers(user.token), + json={"run_spec": get_service_run_spec(repo_id=repo.name, run_name="test-service")}, + ) + + assert response.status_code == 200 + assert response.json()["service"]["url"] == "https://test-service.example.com" + # Verify that unregister_service was called to clean up the dangling service + client_mock.unregister_service.assert_called_once_with( + project=project.name, + run_name="test-service", + ) + # Verify that register_service was called twice (first failed, then succeeded) + assert client_mock.register_service.call_count == 2 From 5fd654551bf98c9b6c86a7a41ccd4fe52108dbb0 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 19 Feb 2026 09:09:50 +0000 Subject: [PATCH 151/187] [runner] Check capabilities(7) (#3587) --- runner/go.mod | 2 + runner/go.sum | 4 ++ runner/internal/executor/executor.go | 34 ++++++++---- .../linux/capabilities/capabilities_darwin.go | 22 ++++++++ .../linux/capabilities/capabilities_linux.go | 52 +++++++++++++++++++ 5 files changed, 103 insertions(+), 11 deletions(-) create mode 100644 runner/internal/linux/capabilities/capabilities_darwin.go create mode 100644 runner/internal/linux/capabilities/capabilities_linux.go diff --git a/runner/go.mod b/runner/go.mod index 260fb880ae..f338e78edd 100644 --- a/runner/go.mod +++ 
b/runner/go.mod @@ -23,6 +23,7 @@ require ( github.com/urfave/cli/v3 v3.6.1 golang.org/x/crypto v0.22.0 golang.org/x/sys v0.26.0 + kernel.org/pub/linux/libs/security/libcap/cap v1.2.77 ) require ( @@ -84,4 +85,5 @@ require ( gopkg.in/warnings.v0 v0.1.2 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect gotest.tools/v3 v3.5.1 // indirect + kernel.org/pub/linux/libs/security/libcap/psx v1.2.77 // indirect ) diff --git a/runner/go.sum b/runner/go.sum index 20c4568f9f..655ea59dc0 100644 --- a/runner/go.sum +++ b/runner/go.sum @@ -321,3 +321,7 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= +kernel.org/pub/linux/libs/security/libcap/cap v1.2.77 h1:iQtQTjFUOcTT19fI8sTCzYXsjeVs56et3D8AbKS2Uks= +kernel.org/pub/linux/libs/security/libcap/cap v1.2.77/go.mod h1:oV+IO8kGh0B7TxErbydDe2+BRmi9g/W0CkpVV+QBTJU= +kernel.org/pub/linux/libs/security/libcap/psx v1.2.77 h1:Z06sMOzc0GNCwp6efaVrIrz4ywGJ1v+DP0pjVkOfDuA= +kernel.org/pub/linux/libs/security/libcap/psx v1.2.77/go.mod h1:+l6Ee2F59XiJ2I6WR5ObpC1utCQJZ/VLsEbQCD8RG24= diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go index ea2ef63930..85a59408fb 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/executor/executor.go @@ -26,6 +26,7 @@ import ( "github.com/dstackai/dstack/runner/consts" "github.com/dstackai/dstack/runner/internal/common" "github.com/dstackai/dstack/runner/internal/connections" + cap "github.com/dstackai/dstack/runner/internal/linux/capabilities" linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" "github.com/dstackai/dstack/runner/internal/log" "github.com/dstackai/dstack/runner/internal/schemas" @@ -467,10 +468,19 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) 
error } cmd.Dir = ex.jobWorkingDir - // Strictly speaking, we need CAP_SETUID and CAP_GUID (for Cmd.Start()-> - // Cmd.SysProcAttr.Credential) and CAP_CHOWN (for startCommand()->os.Chown()), - // but for the sake of simplicity we instead check if we are root or not - if ex.currentUser.IsRoot() { + // CAP_SET{UID,GID} for startCommand() -> Cmd.Start() -> set{uid,gid,groups} syscalls during fork-exec + // CAP_CHOWN for startCommand() -> os.Chown(pts.Name()) + if missing, err := cap.Check(cap.SETUID, cap.SETGID, cap.CHOWN); err != nil { + log.Error( + ctx, "Failed to check capabilities, won't try to set process credentials", + "err", err, "user", ex.currentUser, + ) + } else if len(missing) > 0 { + log.Info( + ctx, "Required capabilities are missing, cannot set process credentials", + "missing", missing, "user", ex.currentUser, + ) + } else { log.Trace(ctx, "Using credentials", "user", ex.jobUser) if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} @@ -480,8 +490,6 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error return fmt.Errorf("prepare process credentials: %w", err) } cmd.SysProcAttr.Credential = creds - } else { - log.Info(ctx, "Current user is not root, cannot set process credentials", "user", ex.currentUser) } envMap := NewEnvMap(ParseEnvList(os.Environ()), jobEnvs, ex.secrets) @@ -509,11 +517,15 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error // Note: we already set RLIMIT_MEMLOCK to unlimited in the shim if we've detected IB devices // (see configureHpcNetworkingIfAvailable() function), but, as it's on the shim side, it only works // with VM-based backends. - rlimitMemlock := unix.Rlimit{Cur: unix.RLIM_INFINITY, Max: unix.RLIM_INFINITY} - // TODO: Check if we have CAP_SYS_RESOURCE. In container environments, even root usually doesn't have - // this capability. 
- if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlimitMemlock); err != nil { - log.Error(ctx, "Failed to set resource limits", "err", err) + if ok, err := cap.Has(cap.SYS_RESOURCE); err != nil { + log.Error(ctx, "Failed to check capabilities, won't try to set resource limits", "err", err) + } else if !ok { + log.Info(ctx, "Required capability is missing, cannot set resource limits", "missing", cap.SYS_RESOURCE) + } else { + rlimitMemlock := unix.Rlimit{Cur: unix.RLIM_INFINITY, Max: unix.RLIM_INFINITY} + if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlimitMemlock); err != nil { + log.Error(ctx, "Failed to set resource limits", "err", err) + } } // HOME must be added after writeDstackProfile to avoid overriding the correct per-user value set by sshd diff --git a/runner/internal/linux/capabilities/capabilities_darwin.go b/runner/internal/linux/capabilities/capabilities_darwin.go new file mode 100644 index 0000000000..6a60f94af1 --- /dev/null +++ b/runner/internal/linux/capabilities/capabilities_darwin.go @@ -0,0 +1,22 @@ +//go:build darwin + +package capabilities + +import "errors" + +type Capability string + +const ( + SETUID = Capability("SETUID") + SETGID = Capability("SETGID") + CHOWN = Capability("CHOWN") + SYS_RESOURCE = Capability("SYS_RESOURCE") +) + +func Has(c Capability) (bool, error) { + return false, errors.New("not supported") +} + +func Check(cs ...Capability) (missing []Capability, err error) { + return nil, errors.New("not supported") +} diff --git a/runner/internal/linux/capabilities/capabilities_linux.go b/runner/internal/linux/capabilities/capabilities_linux.go new file mode 100644 index 0000000000..c27e887a5d --- /dev/null +++ b/runner/internal/linux/capabilities/capabilities_linux.go @@ -0,0 +1,52 @@ +//go:build linux + +package capabilities + +import ( + "strings" + + "kernel.org/pub/linux/libs/security/libcap/cap" +) + +type Capability cap.Value + +const ( + SETUID = Capability(cap.SETUID) + SETGID = Capability(cap.SETGID) + CHOWN = 
Capability(cap.CHOWN) + SYS_RESOURCE = Capability(cap.SYS_RESOURCE) +) + +// String returns a text representation of the capability in the form used by container folks: +// UPPER_CASE, no CAP_ prefix: cap_sys_admin -> SYS_ADMIN +func (c Capability) String() string { + return strings.ToUpper(cap.Value(c).String()[4:]) +} + +// Has returns true if the current process has the specified capability in its effective set +func Has(c Capability) (bool, error) { + set, err := cap.GetPID(0) + if err != nil { + return false, err + } + return set.GetFlag(cap.Effective, cap.Value(c)) +} + +// Check checks and returns those capabilities that are _missing_ from the effective set +// of the current process +func Check(cs ...Capability) (missing []Capability, err error) { + set, err := cap.GetPID(0) + if err != nil { + return nil, err + } + for _, c := range cs { + ok, err := set.GetFlag(cap.Effective, cap.Value(c)) + if err != nil { + return nil, err + } + if !ok { + missing = append(missing, c) + } + } + return missing, nil +} From 008efc891c505041823ee5e0f79809f0c8402578 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Thu, 19 Feb 2026 09:10:11 +0000 Subject: [PATCH 152/187] [runner] Check if repo dir exists before chown (#3589) The check is added to avoid the following log message when no repo specified or the repo is empty: > Error while walking repo dir path=/workflow err=lstat /workflow: > no such file or directory In addition, walk/chown errors log level is changed to warning to highlight possible issues. 
--- runner/internal/executor/repo.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/runner/internal/executor/repo.go b/runner/internal/executor/repo.go index 467c783a88..dd16092be9 100644 --- a/runner/internal/executor/repo.go +++ b/runner/internal/executor/repo.go @@ -236,16 +236,25 @@ func (ex *RunExecutor) restoreRepoDir(ctx context.Context, tmpDir string) error func (ex *RunExecutor) chownRepoDir(ctx context.Context) error { log.Trace(ctx, "Chowning repo dir") + exists, err := common.PathExists(ex.repoDir) + // We consider all errors here non-fatal + if err != nil { + log.Warning(ctx, "Failed to check if repo dir exists", "err", err) + return nil + } + if !exists { + log.Trace(ctx, "Repo dir does not exist") + return nil + } return filepath.WalkDir( ex.repoDir, func(p string, d fs.DirEntry, err error) error { - // We consider walk/chown errors non-fatal if err != nil { - log.Debug(ctx, "Error while walking repo dir", "path", p, "err", err) + log.Warning(ctx, "Error while walking repo dir", "path", p, "err", err) return nil } if err := os.Chown(p, ex.jobUser.Uid, ex.jobUser.Gid); err != nil { - log.Debug(ctx, "Error while chowning repo dir", "path", p, "err", err) + log.Warning(ctx, "Error while chowning repo dir", "path", p, "err", err) } return nil }, From e88e25f9fe320bac64bf9c967635a17e55faa4a8 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Thu, 19 Feb 2026 17:29:23 +0500 Subject: [PATCH 153/187] Fix concurrent indexes migration (#3591) --- src/dstack/_internal/server/migrations/env.py | 2 +- ...f90_add_pipeline_indexes_for_compute_and_.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/server/migrations/env.py b/src/dstack/_internal/server/migrations/env.py index 81d8ba0694..c7c27f1f8b 100644 --- a/src/dstack/_internal/server/migrations/env.py +++ b/src/dstack/_internal/server/migrations/env.py @@ -73,7 +73,7 @@ def run_migrations(connection: Connection): # 
lock_timeout is needed so that migrations that acquire locks # do not wait for locks forever, blocking live queries. # Better to fail and retry a deployment. - connection.execute(text("SET lock_timeout='10s';")) + connection.execute(text("SET lock_timeout='15s';")) connection.commit() context.configure( connection=connection, diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py b/src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py index ad35a23d06..44cc2846ec 100644 --- a/src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py +++ b/src/dstack/_internal/server/migrations/versions/2026/02_18_1122_a8ed24fd7f90_add_pipeline_indexes_for_compute_and_.py @@ -19,6 +19,21 @@ def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### with op.get_context().autocommit_block(): + if op.get_context().dialect.name == "postgresql": + # Concurrent index ops can fail midway, leaving invalid indexes behind. + # Use DROP INDEX IF EXISTS so the migration can be retried safely. 
+            op.drop_index(
+                "ix_compute_groups_pipeline_fetch_q",
+                table_name="compute_groups",
+                if_exists=True,
+                postgresql_concurrently=True,
+            )
+            op.drop_index(
+                "ix_placement_groups_pipeline_fetch_q",
+                table_name="placement_groups",
+                if_exists=True,
+                postgresql_concurrently=True,
+            )
         op.create_index(
             "ix_compute_groups_pipeline_fetch_q",
             "compute_groups",
@@ -47,11 +62,13 @@ def downgrade() -> None:
         op.drop_index(
             "ix_placement_groups_pipeline_fetch_q",
             "placement_groups",
+            if_exists=True,
             postgresql_concurrently=True,
         )
         op.drop_index(
             "ix_compute_groups_pipeline_fetch_q",
             "compute_groups",
+            if_exists=True,
             postgresql_concurrently=True,
         )
     # ### end Alembic commands ###

From 8764de4807b9dc9703f60670c0c14d033e5dcfbf Mon Sep 17 00:00:00 2001
From: Victor Skvortsov
Date: Thu, 19 Feb 2026 18:10:53 +0500
Subject: [PATCH 154/187] Document Adding indexes (#3594)

---
 contributing/MIGRATIONS.md | 26 ++++++++++++++++++++++++++
 contributing/RELEASE.md    |  5 ++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/contributing/MIGRATIONS.md b/contributing/MIGRATIONS.md
index 8918d9e36b..f494d829d5 100644
--- a/contributing/MIGRATIONS.md
+++ b/contributing/MIGRATIONS.md
@@ -47,3 +47,29 @@ These steps apply to **renaming a column** or **changing the type of a column**
 ### Altering multiple tables
 
 Altering a table requires Postgres to [take an ACCESS EXCLUSIVE lock](https://www.postgresql.org/docs/current/sql-altertable.html). (This applies not only to statements that rewrite the tables but also to statements that modify tables metadata.) Altering multiple tables can cause deadlocks due to conflict with read operations since the `dstack` server does not define an order for read operations. Altering multiple tables should be done in separate transactions/migrations.
+
+### Adding indexes
+
+Use `CREATE INDEX CONCURRENTLY` to avoid taking an exclusive lock on the table for a long time. 
+For migrations that create multiple indexes, failures can leave the schema in a partial state
+(some indexes already created, some missing). On Postgres, concurrent index creation can also fail
+midway and leave an invalid index object with the same name. Retrying the migration then fails
+with "already exists".
+
+For retry-safe migrations, pre-drop indexes with `if_exists=True` before creating them again:
+
+```python
+with op.get_context().autocommit_block():
+    op.drop_index(
+        "ix_table_col",
+        table_name="table",
+        if_exists=True,
+        postgresql_concurrently=True,
+    )
+    op.create_index(
+        "ix_table_col",
+        "table",
+        ["col"],
+        postgresql_concurrently=True,
+    )
+```
diff --git a/contributing/RELEASE.md b/contributing/RELEASE.md
index 56df59595e..f0798082fc 100644
--- a/contributing/RELEASE.md
+++ b/contributing/RELEASE.md
@@ -8,7 +8,10 @@ This is a `dstack` release guide and checklist for core maintainers.
    1. Compare changes to the previous release, e.g. [`https://github.com/dstackai/dstack/compare/0.19.39...master`](https://github.com/dstackai/dstack/compare/0.19.39...master).
    2. Test that `master` CLI works with the previous server release. PRs that add new model fields can potentially break client backward compatibility.
    3. Test that `master` server works with the previous CLI release.
-   4. Pay special attention to releases with DB migrations. Migrations should work with rolling deployments and avoid locking multiple tables. See [MIGRATIONS.md](MIGRATIONS.md).
+   4. Pay special attention to releases with DB migrations. See [MIGRATIONS.md](MIGRATIONS.md).
+      * Ensure migrations work with rolling deployments and do not lock multiple tables.
+      * Test applying migrations while old replicas do active processing.
+      * Test migrations can be retried if they fail. For example, a concurrent index creation may fail and stay in an invalid state.
 2. Create a tag, e.g. `git tag 0.19.40`.
 3. Push the tag to trigger the Release `workflow`, i.e. `git push --tags`.
 4. 
Generate GitHub release notes from the tag. Highlight major features, deprecations, breaking changes. From be7daeb9853e05648dcf2d2bcdfdaa974fab81b7 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Thu, 19 Feb 2026 16:33:43 +0100 Subject: [PATCH 155/187] [Blog] Model inference with Prefill-Decode disaggregation (#3595) --- docs/blog/posts/pd-disaggregation.md | 141 +++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 docs/blog/posts/pd-disaggregation.md diff --git a/docs/blog/posts/pd-disaggregation.md b/docs/blog/posts/pd-disaggregation.md new file mode 100644 index 0000000000..10542b2208 --- /dev/null +++ b/docs/blog/posts/pd-disaggregation.md @@ -0,0 +1,141 @@ +--- +title: "Model inference with Prefill-Decode disaggregation" +date: 2026-02-19 +description: "TBA" +slug: pd-disaggregation +image: https://dstack.ai/static-assets/static-assets/images/dstack-pd-disaggregation.png +categories: + - Changelog +links: + - SGLang router integration: https://dstack.ai/blog/sglang-router/ +--- + +# Model inference with Prefill-Decode disaggregation + +While `dstack` started as a GPU-native orchestrator for development and training, over the last year it has increasingly brought inference to the forefront — making serving a first-class citizen. + + + +At the end of last year, we introduced [SGLang router](../posts/sglang-router.md) integration — bringing cache-aware routing to [services](../../docs/concepts/services.md). Today, building on that integration, we’re adding native Prefill–Decode (PD) disaggregation. + + + +Unlike many PD disaggregation setups tied to Kubernetes as the control plane, dstack does not depend on Kubernetes. It’s an open-source, GPU-native orchestrator that can provision GPUs directly in your cloud accounts or on bare-metal infrastructure — while also running on top of existing Kubernetes clusters if needed. 
+ +For inference, `dstack` provides a [services](../../docs/concepts/services.md) abstraction. While remaining framework-agnostic, we integrate more deeply with leading open-source frameworks — [SGLang](https://github.com/sgl-project/sglang) being one of them for model inference. + +> If you’re new to Prefill–Decode disaggregation, see the official [SGLang docs](https://docs.sglang.io/advanced_features/pd_disaggregation.html). + +## Services + +With `dstack` `0.20.10`, you can define a service with separate replica groups for Prefill and Decode workers and enable PD disaggregation directly in the `router` configuration. + +

    + +```yaml +type: service +name: glm45air + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +image: lmsysorg/sglang:latest + +replicas: + - count: 1..4 + scaling: + metric: rps + target: 3 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 + resources: + gpu: H200 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +probes: + - type: http + url: /health_generate + interval: 15s + +router: + type: sglang + pd_disaggregation: true +``` + +
    + +Deploy it as usual: + +
    + +```shell +$ HF_TOKEN=... +$ dstack apply -f glm45air.dstack.yml +``` + +
    + +### Gateway + +Just like `dstack` relies on the SGLang router for cache-aware routing, Prefill–Decode disaggregation also requires a [gateway](../../docs/concepts/gateways.md#sglang) configured with the SGLang router. + +
    + +```yaml +type: gateway +name: inference-gateway + +backends: [kubernetes] +region: any + +domain: example.com + +router: + type: sglang + policy: cache_aware +``` + +
    + +## Limitations + +* Because the SGLang router requires all workers to be on the same network, and `dstack` currently runs the router inside the gateway, the gateway and the service must be running in the same cluster. +* Prefill–Decode disaggregation is currently available with the SGLang backend (vLLM support is coming). +* Autoscaling supports RPS as the metric for now; TTFT and ITL metrics are planned next. + +With native support for inference and now Prefill–Decode disaggregation, `dstack` makes it easier to run high-throughput, low-latency model serving across GPU clouds, and Kubernetes or bare-metal clusters. + +## What's next? + +We’re working on PD disaggregation benchmarks and tuning guidance — coming soon. + +In the meantime: + +1. Read about [services](../../docs/concepts/services.md), [gateways](../../docs/concepts/gateways.md), and [fleets](../../docs/concepts/fleets.md) +2. Check out [Quickstart](../../docs/quickstart.md) +3. Join [Discord](https://discord.gg/u8SmfwPpMd) From c9cf54ed698061d9c20ddcd442e1dca1be233556 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Fri, 20 Feb 2026 08:02:17 +0000 Subject: [PATCH 156/187] [runner] Drop buildLDLibraryPathEnv() (#3593) This code is: - conda-specific - dead Basically, the function (ab)uses python3-config [1] utility (which, BTW, is not present in dstack base images since conda -> uv migration) to calculate the path to conda-installed shared objects and export it via LD_LIBRARY_PATH (the proper way to add conda-installed libs would be to use ld.so's configuration files, that is, ld.so.conf.d/*). Although this code was added in https://github.com/dstackai/dstack/pull/1354, it is not related to TPU support at all. 
[1]: https://manpages.debian.org/testing/python3-dev/python3-config.1.en.html --- runner/internal/executor/executor.go | 46 +--------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go index 85a59408fb..61e18ee3e9 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/executor/executor.go @@ -444,15 +444,6 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error "DSTACK_MPI_HOSTFILE": mpiHostfilePath, } - // Call buildLDLibraryPathEnv and update jobEnvs if no error occurs - newLDPath, err := buildLDLibraryPathEnv(ctx) - if err != nil { - log.Info(ctx, "Continuing without updating LD_LIBRARY_PATH") - } else { - jobEnvs["LD_LIBRARY_PATH"] = newLDPath - log.Info(ctx, "New LD_LIBRARY_PATH set", "LD_LIBRARY_PATH", newLDPath) - } - cmd := exec.CommandContext(ctx, ex.jobSpec.Commands[0], ex.jobSpec.Commands[1:]...) cmd.Cancel = func() error { // returns error on Windows @@ -504,8 +495,7 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error log.Warning(ctx, "failed to include dstack_profile", "path", profilePath, "err", err) } - err = writeMpiHostfile(ctx, ex.clusterInfo.JobIPs, gpusPerNodeNum, mpiHostfilePath) - if err != nil { + if err := writeMpiHostfile(ctx, ex.clusterInfo.JobIPs, gpusPerNodeNum, mpiHostfilePath); err != nil { return fmt.Errorf("write MPI hostfile: %w", err) } @@ -621,40 +611,6 @@ func isPtyError(err error) bool { return errors.As(err, &e) && errors.Is(e.Err, syscall.EIO) } -func buildLDLibraryPathEnv(ctx context.Context) (string, error) { - // Execute shell command to get Python prefix - cmd := exec.CommandContext(ctx, "bash", "-i", "-c", "python3-config --prefix") - output, err := cmd.Output() - if err != nil { - return "", fmt.Errorf("error executing command: %w", err) - } - - // Extract and trim the prefix path - prefixPath := strings.TrimSpace(string(output)) - - // Check 
if the prefix path exists - if _, err := os.Stat(prefixPath); os.IsNotExist(err) { - return "", fmt.Errorf("python prefix path does not exist: %s", prefixPath) - } - - // Construct the path to Python's shared libraries - sharedLibPath := fmt.Sprintf("%s/lib", prefixPath) - - // Get current LD_LIBRARY_PATH - currentLDPath := os.Getenv("LD_LIBRARY_PATH") - - // Append Python's shared library path if not already present - if !strings.Contains(currentLDPath, sharedLibPath) { - if currentLDPath == "" { - currentLDPath = sharedLibPath - } else { - currentLDPath = fmt.Sprintf("%s:%s", currentLDPath, sharedLibPath) - } - } - - return currentLDPath, nil -} - // A simplified copypasta of creack/pty Start->StartWithSize->StartWithAttrs // with two additions: // * controlling terminal is properly set (cmd.Extrafiles, Cmd.SysProcAttr.Ctty) From 7c4314b714f57941ba692ea67f4c8ac4cd13a2db Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Fri, 20 Feb 2026 08:43:26 +0000 Subject: [PATCH 157/187] Fix mutually exclusive fields validation (#3598) Pydantic validators should raise `ValueError`, `TypeError`, or `AssertionError`, not `KeyError`. 
--- src/dstack/_internal/core/models/configurations.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 93c63e6b31..345f082f3d 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -546,7 +546,7 @@ class BaseRunConfiguration(CoreModel): @validator("python", pre=True, always=True) def convert_python(cls, v, values) -> Optional[PythonVersion]: if v is not None and values.get("image"): - raise KeyError("`image` and `python` are mutually exclusive fields") + raise ValueError("`image` and `python` are mutually exclusive fields") if isinstance(v, float): v = str(v) if v == "3.1": @@ -558,11 +558,11 @@ def convert_python(cls, v, values) -> Optional[PythonVersion]: @validator("docker", pre=True, always=True) def _docker(cls, v, values) -> Optional[bool]: if v is True and values.get("image"): - raise KeyError("`image` and `docker` are mutually exclusive fields") + raise ValueError("`image` and `docker` are mutually exclusive fields") if v is True and values.get("python"): - raise KeyError("`python` and `docker` are mutually exclusive fields") + raise ValueError("`python` and `docker` are mutually exclusive fields") if v is True and values.get("nvcc"): - raise KeyError("`nvcc` and `docker` are mutually exclusive fields") + raise ValueError("`nvcc` and `docker` are mutually exclusive fields") # Ideally, we'd like to also prohibit privileged=False when docker=True, # but it's not possible to do so without breaking backwards compatibility. 
return v From 20771c38ba255d1db663404b7429942385b8692b Mon Sep 17 00:00:00 2001 From: Bihan Rana Date: Fri, 20 Feb 2026 19:39:52 +0545 Subject: [PATCH 158/187] [Docs] PD disaggregation (#3592) * Add pd-disaggregation docs * Add pd.dstack.yml file * Minor update * Update gateway and services docs * [Docs] Minor changes related to PD disaggregation --------- Co-authored-by: Bihan Rana Co-authored-by: peterschmidt85 --- docs/docs/concepts/gateways.md | 9 ++- docs/docs/concepts/services.md | 7 +- examples/inference/sglang/README.md | 103 ++++++++++++++++++++++-- examples/inference/sglang/pd.dstack.yml | 51 ++++++++++++ 4 files changed, 158 insertions(+), 12 deletions(-) create mode 100644 examples/inference/sglang/pd.dstack.yml diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 55573bd747..6ed19c2a09 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -110,7 +110,11 @@ router: -!!! info "Policy" +If you configure the `sglang` router, [services](../concepts/services.md) can run either [standard SGLang workers](../../examples/inference/sglang/index.md) or [Prefill-Decode workers](../../examples/inference/sglang/index.md#pd-disaggregation) (aka PD disaggregation). + +> Note, if you want to run services with PD disaggregation, the gateway must currently run in the same cluster as the service. + +??? info "Policy" The `policy` property allows you to configure the routing policy: * `cache_aware` — Default policy; combines cache locality with load balancing, falling back to shortest queue. @@ -119,9 +123,6 @@ router: * `round_robin` — Cycles through workers in order. -> Currently, services using this type of gateway must run standard SGLang workers. See the [example](../../examples/inference/sglang/index.md). -> -> Support for prefill/decode disaggregation and auto-scaling based on inter-token latency is coming soon. 
### Public IP diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index d40984866b..1eb63dd01e 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -182,6 +182,8 @@ Setting the minimum number of replicas to `0` allows the service to scale down t > The `scaling` property requires creating a [gateway](gateways.md). + + ??? info "Replica groups" A service can include multiple replica groups. Each group can define its own `commands`, `resources` requirements, and `scaling` rules. @@ -230,8 +232,9 @@ Setting the minimum number of replicas to `0` allows the service to scale down t > Properties such as `regions`, `port`, `image`, `env` and some other cannot be configured per replica group. This support is coming soon. -??? info "Disaggregated serving" - Native support for disaggregated prefill and decode, allowing both worker types to run within a single service, is coming soon. +### PD disaggregation + +If you create a gateway with the [`sglang` router](gateways.md#sglang), you can run SGLang with [Prefill-Decode disaggregation](https://docs.sglang.io/advanced_features/pd_disaggregation.html). See the [corresponding example](../../examples/inference/sglang/index.md#pd-disaggregation). ### Authorization diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md index 5b7dc640a0..6549afe5c5 100644 --- a/examples/inference/sglang/README.md +++ b/examples/inference/sglang/README.md @@ -9,7 +9,7 @@ This example shows how to deploy DeepSeek-R1-Distill-Llama 8B and 70B using [SGL ## Apply a configuration -Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B using SgLang. +Here's an example of a service that deploys DeepSeek-R1-Distill-Llama 8B and 70B using SGLang. === "NVIDIA" @@ -108,15 +108,106 @@ curl http://127.0.0.1:3000/proxy/services/main/deepseek-r1/v1/chat/completions \ ``` -!!! info "SGLang Model Gateway" - If you'd like to use a custom routing policy, e.g. 
by leveraging the [SGLang Model Gateway](https://docs.sglang.ai/advanced_features/router.html#), create a gateway with `router` set to `sglang`. Check out [gateways](https://dstack.ai/docs/concepts/gateways#router) for more details. +!!! info "Router policy" + If you'd like to use a custom routing policy, create a gateway with `router` set to `sglang`. Check out [gateways](https://dstack.ai/docs/concepts/gateways#router) for more details. -> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling or HTTPs, rate-limits, etc), the service endpoint will be available at `https://deepseek-r1./`. +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://deepseek-r1./`. + +## Configuration options + +### PD disaggregation + +If you create a gateway with the [`sglang` router](https://dstack.ai/docs/concepts/gateways/#sglang), you can run SGLang with [PD disaggregation](https://docs.sglang.io/advanced_features/pd_disaggregation.html). + +
    + +```yaml +type: service +name: prefill-decode +image: lmsysorg/sglang:latest + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1..4 + scaling: + metric: rps + target: 3 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: H200 + + - count: 1..8 + scaling: + metric: rps + target: 2 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 + resources: + gpu: H200 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +# Custom probe is required for PD disaggregation +probes: + - type: http + url: /health_generate + interval: 15s + +router: + type: sglang + pd_disaggregation: true +``` + +
    + +Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics are coming soon. + +#### Gateway + +Note, running services with PD disaggregation currently requires the gateway to run in the same cluster as the service. + +For example, if you run services on the `kubernetes` backend, make sure to also create the gateway in the same backend: + +
    + +```yaml +type: gateway +name: gateway-name + +backend: kubernetes +region: any + +domain: example.com +router: + type: sglang +``` + +
    + + ## Source code -The source-code of this example can be found in -[`examples/llms/deepseek/sglang`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/sglang). +The source-code of these examples can be found in +[`examples/llms/deepseek/sglang`](https://github.com/dstackai/dstack/blob/master/examples/llms/deepseek/sglang) and [`examples/inference/sglang`](https://github.com/dstackai/dstack/blob/master/examples/inference/sglang). ## What's next? diff --git a/examples/inference/sglang/pd.dstack.yml b/examples/inference/sglang/pd.dstack.yml new file mode 100644 index 0000000000..614d4e72b5 --- /dev/null +++ b/examples/inference/sglang/pd.dstack.yml @@ -0,0 +1,51 @@ +type: service +name: prefill-decode +image: lmsysorg/sglang:latest + +env: + - HF_TOKEN + - MODEL_ID=zai-org/GLM-4.5-Air-FP8 + +replicas: + - count: 1..4 + scaling: + metric: rps + target: 3 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 \ + --disaggregation-bootstrap-port 8998 + resources: + gpu: 1 + + - count: 1..8 + scaling: + metric: rps + target: 2 + commands: + - | + python -m sglang.launch_server \ + --model-path $MODEL_ID \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend mooncake \ + --host 0.0.0.0 \ + --port 8000 + resources: + gpu: 1 + +port: 8000 +model: zai-org/GLM-4.5-Air-FP8 + +probes: + - type: http + url: /health_generate + interval: 15s + +router: + type: sglang + pd_disaggregation: true From 41079ff717894663222b0047b4e8887211c748f9 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Fri, 20 Feb 2026 14:33:19 +0000 Subject: [PATCH 159/187] [Docs] Clarify how K8s resources and offers work (#3565) Co-authored-by: peterschmidt85 --- docs/docs/concepts/backends.md | 38 +++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/docs/docs/concepts/backends.md 
b/docs/docs/concepts/backends.md index 9dc2ebb573..0213b669d4 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -1060,7 +1060,43 @@ projects: Ensure you've created a ClusterRoleBinding to grant the role to the user or the service account you're using. -> To learn more, see the [Lambda](../../examples/clusters/lambda/#kubernetes) and [Lambda](../../examples/clusters/crusoe/#kubernetes) examples. +??? info "Resources and offers" + If you use ranges with [`resources`](../concepts/tasks.md#resources) (e.g. `gpu: 1..8` or `memory: 64GB..`) in fleet or run configurations, other backends collect and try all offers that satisfy the range. + + The `kubernetes` backend handles it differently. + + * For `gpu`, if you specify a range (e.g. `gpu: 4..8`), the `kubernetes` backend only provisions pods with the GPU count equal to the lower limit (`4`). The upper limit of the GPU range is always ignored. + * For other resources such as `cpu`, `memory`, and `disk`, the `kubernetes` backend passes the lower and upper limits of the range as Kubernetes [requests and limits](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#requests-and-limits) respectively. If the upper limit is not set, the Kubernetes limit is also not set. + + Example: + +
    + + ```yaml + type: dev-environment + ide: vscode + + resources: + cpu: 32..64 + memory: 1024GB + disk: 100GB.. + gpu: nvidia:4..8 + ``` + +
    + + This translates to the following Kubernetes resource spec: + + | Resource | Request | Limit | + |---------------------|----------|-----------| + | `cpu` | `32` | `64` | + | `memory` | `1024Gi` | `1024Gi` | + | `ephemeral-storage` | `100Gi` | _not set_ | + | `nvidia.com/gpu` | `4` | `4` | + + This applies to offers shown in `dstack apply` (run plans), during provisioning, and in `dstack offer`. Unlike other backends, offers for the `kubernetes` backend always reflect the lower limit of the range. + +> To learn more, see the [Lambda](../../examples/clusters/lambda/#kubernetes) and [Crusoe](../../examples/clusters/crusoe/#kubernetes) examples. ### RunPod From 768bdc1e966ecdea0f8a1c96d9ff4cf262335907 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Fri, 20 Feb 2026 16:08:02 +0100 Subject: [PATCH 160/187] Allow `https: auto` for services (#3600) * Add https: auto mode for services Allow setting `https` to `auto` in service configuration. When set to `auto`, the effective HTTPS setting is resolved at registration time based on the gateway's certificate configuration: enabled for Let's Encrypt, disabled for no certificate or ACM. The default remains `true` for backward compatibility. 
Co-authored-by: Cursor * Update gateway docs: add Domain and Certificate sections, document private gateways - Add "Domain" section explaining domain requirement and DNS setup - Add "Certificate" section covering lets-encrypt, acm, and null options - Expand "Public IP" with private gateway example - Move "Instance type" after "Public IP" - Update certificate field description to mention null - Add null note to gateway reference page --- docs/docs/concepts/gateways.md | 91 ++++++++++------ docs/docs/reference/dstack.yml/gateway.md | 2 + .../_internal/core/models/configurations.py | 10 +- src/dstack/_internal/core/models/gateways.py | 5 +- .../server/services/services/__init__.py | 12 ++- .../server/services/services/test_services.py | 100 ++++++++++++++++++ 6 files changed, 181 insertions(+), 39 deletions(-) create mode 100644 src/tests/_internal/server/services/services/test_services.py diff --git a/docs/docs/concepts/gateways.md b/docs/docs/concepts/gateways.md index 6ed19c2a09..c15eb60c21 100644 --- a/docs/docs/concepts/gateways.md +++ b/docs/docs/concepts/gateways.md @@ -32,8 +32,6 @@ domain: example.com -A domain name is required to create a gateway. - To create or update the gateway, simply call the [`dstack apply`](../reference/cli/dstack/apply.md) command:
    @@ -53,6 +51,12 @@ Provisioning... ## Configuration options +### Domain + +A gateway requires a `domain` to be specified in the configuration before creation. The domain is used to generate service endpoints (e.g. `.`). + +Once the gateway is created and assigned a hostname, configure your DNS by adding a wildcard record for `*.` (e.g. `*.example.com`). The record should point to the gateway's hostname and should be of type `A` if the hostname is an IP address (most cases), or of type `CNAME` if the hostname is another domain (some private gateways and Kubernetes). + ### Backend You can create gateways with the `aws`, `azure`, `gcp`, or `kubernetes` backends, but that does not limit where services run. A gateway can use one backend while services run on any other backend supported by dstack, including backends where gateways themselves cannot be created. @@ -61,27 +65,6 @@ You can create gateways with the `aws`, `azure`, `gcp`, or `kubernetes` backends Gateways in `kubernetes` backend require an external load balancer. Managed Kubernetes solutions usually include a load balancer. For self-hosted Kubernetes, you must provide a load balancer by yourself. -### Instance type - -By default, `dstack` provisions a small, low-cost instance for the gateway. If you expect to run high-traffic services, you can configure a larger instance type using the `instance_type` property. - -
    - -```yaml -type: gateway -name: example-gateway - -backend: aws -region: eu-west-1 - -# (Optional) Override the gateway instance type -instance_type: t3.large - -domain: example.com -``` - -
    - ### Router By default, the gateway uses its own load balancer to route traffic between replicas. However, you can delegate this responsibility to a specific router by setting the `router` property. Currently, the only supported external router is `sglang`. @@ -124,21 +107,65 @@ If you configure the `sglang` router, [services](../concepts/services.md) can ru +### Certificate + +By default, when you run a service with a gateway, `dstack` provisions an SSL certificate via Let's Encrypt for the configured domain. This automatically enables HTTPS for the service endpoint. + +If you disable [public IP](#public-ip) (e.g. to make the gateway private) or if you simply don't need HTTPS, you can set `certificate` to `null`. + +> Note, by default services set [`https`](../reference/dstack.yml/service.md#https) to `true` which requires a certificate. You can set `https` to `auto` to detect if the gateway supports HTTPS or not automatically. + +??? info "Certificate types" + `dstack` supports the following certificate types: + + * `lets-encrypt` (default) — Automatic certificates via [Let's Encrypt](https://letsencrypt.org/). Requires a [public IP](#public-ip). + * `acm` — Certificates managed by [AWS Certificate Manager](https://aws.amazon.com/certificate-manager/). AWS-only. TLS is terminated at the load balancer, not at the gateway. + * `null` — No certificate. Services will use HTTP. + ### Public IP -If you don't need/want a public IP for the gateway, you can set the `public_ip` to `false` (the default value is `true`), making the gateway private. +If you don't need a public IP for the gateway, you can set `public_ip` to `false` (the default is `true`), making the gateway private. + Private gateways are currently supported in `aws` and `gcp` backends. -!!! info "Reference" - For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md). +
    -## Update DNS records +```yaml +type: gateway +name: private-gateway + +backend: aws +region: eu-west-1 +domain: example.com + +public_ip: false +certificate: null +``` + +
    + +### Instance type + +By default, `dstack` provisions a small, low-cost instance for the gateway. If you expect to run high-traffic services, you can configure a larger instance type using the `instance_type` property. + +
    -Once the gateway is assigned a hostname, go to your domain's DNS settings -and add a DNS record for `*.`, e.g. `*.example.com`. -The record should point to the gateway's hostname shown in `dstack` -and should be of type `A` if the hostname is an IP address (most cases), -or of type `CNAME` if the hostname is another domain (some private gateways and Kubernetes). +```yaml +type: gateway +name: example-gateway + +backend: aws +region: eu-west-1 + +instance_type: t3.large + +domain: example.com +``` + +
    + +!!! info "Reference" + For all gateway configuration options, refer to the [reference](../reference/dstack.yml/gateway.md). ## Manage gateways diff --git a/docs/docs/reference/dstack.yml/gateway.md b/docs/docs/reference/dstack.yml/gateway.md index 1d74c95705..33fbeb4190 100644 --- a/docs/docs/reference/dstack.yml/gateway.md +++ b/docs/docs/reference/dstack.yml/gateway.md @@ -22,6 +22,8 @@ The `gateway` configuration type allows creating and updating [gateways](../../c ### `certificate` +Set to `null` to disable certificates (e.g. for [private gateways](../../concepts/gateways.md#public-ip)). + === "Let's encrypt" #SCHEMA# dstack._internal.core.models.gateways.LetsEncryptGatewayCertificate diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 345f082f3d..87e38b496a 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -856,9 +856,13 @@ class ServiceConfigurationParams(CoreModel): ) ), ] = None - https: Annotated[bool, Field(description="Enable HTTPS if running with a gateway")] = ( - SERVICE_HTTPS_DEFAULT - ) + https: Annotated[ + Union[bool, Literal["auto"]], + Field( + description="Enable HTTPS if running with a gateway." + " Set to `auto` to determine automatically based on gateway configuration" + ), + ] = SERVICE_HTTPS_DEFAULT auth: Annotated[bool, Field(description="Enable the authorization")] = True scaling: Annotated[ diff --git a/src/dstack/_internal/core/models/gateways.py b/src/dstack/_internal/core/models/gateways.py index 816395fc82..7f09d3df18 100644 --- a/src/dstack/_internal/core/models/gateways.py +++ b/src/dstack/_internal/core/models/gateways.py @@ -77,7 +77,10 @@ class GatewayConfiguration(CoreModel): public_ip: Annotated[bool, Field(description="Allocate public IP for the gateway")] = True certificate: Annotated[ Optional[AnyGatewayCertificate], - Field(description="The SSL certificate configuration. 
Defaults to `type: lets-encrypt`"), + Field( + description="The SSL certificate configuration." + " Set to `null` to disable. Defaults to `type: lets-encrypt`" + ), ] = LetsEncryptGatewayCertificate() tags: Annotated[ Optional[Dict[str, str]], diff --git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index 8dba43ea85..8f0849050f 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -22,7 +22,6 @@ ) from dstack._internal.core.models.configurations import ( DEFAULT_REPLICA_GROUP_NAME, - SERVICE_HTTPS_DEFAULT, ServiceConfiguration, ) from dstack._internal.core.models.gateways import GatewayConfiguration, GatewayStatus @@ -241,7 +240,7 @@ def _register_service_in_server(run_model: RunModel, run_spec: RunSpec) -> Servi "Service with SGLang router configuration requires a gateway. " "Please configure a gateway with the SGLang router enabled." 
) - if run_spec.configuration.https != SERVICE_HTTPS_DEFAULT: + if run_spec.configuration.https is False: # Note: if the user sets `https: `, it will be ignored silently # TODO: in 0.19, make `https` Optional to be able to tell if it was set or omitted raise ServerClientError( @@ -416,7 +415,14 @@ async def unregister_replica(session: AsyncSession, job_model: JobModel): def _get_service_https(run_spec: RunSpec, configuration: GatewayConfiguration) -> bool: assert run_spec.configuration.type == "service" - if not run_spec.configuration.https: + https = run_spec.configuration.https + if https == "auto": + if configuration.certificate is None: + return False + if configuration.certificate.type == "acm": + return False + return True + if not https: return False if configuration.certificate is not None and configuration.certificate.type == "acm": return False diff --git a/src/tests/_internal/server/services/services/test_services.py b/src/tests/_internal/server/services/services/test_services.py new file mode 100644 index 0000000000..1ee3bf70e4 --- /dev/null +++ b/src/tests/_internal/server/services/services/test_services.py @@ -0,0 +1,100 @@ +from typing import Literal, Optional, Union +from unittest.mock import MagicMock + +import pytest + +from dstack._internal.core.errors import ServerClientError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.gateways import ( + ACMGatewayCertificate, + AnyGatewayCertificate, + GatewayConfiguration, + LetsEncryptGatewayCertificate, +) +from dstack._internal.core.models.runs import RunSpec +from dstack._internal.server.services.services import ( + _get_service_https, + _register_service_in_server, +) +from dstack._internal.server.testing.common import get_run_spec + + +def _service_run_spec(https: Union[bool, Literal["auto"]] = "auto") -> RunSpec: + return get_run_spec( + repo_id="test-repo", + 
configuration=ServiceConfiguration(commands=["python serve.py"], port=8000, https=https), + ) + + +def _gateway_config( + certificate: Optional[AnyGatewayCertificate] = LetsEncryptGatewayCertificate(), +) -> GatewayConfiguration: + return GatewayConfiguration( + backend=BackendType.AWS, + region="us-east-1", + certificate=certificate, + ) + + +def _mock_run_model() -> MagicMock: + run_model = MagicMock() + run_model.project.name = "test-project" + run_model.run_name = "test-run" + return run_model + + +class TestServiceConfigurationHttps: + def test_default_is_true(self) -> None: + conf = ServiceConfiguration(commands=["python serve.py"], port=8000) + assert conf.https is True + + def test_accepts_auto(self) -> None: + conf = ServiceConfiguration(commands=["python serve.py"], port=8000, https="auto") + assert conf.https == "auto" + + +class TestGetServiceHttps: + def test_auto_resolves_to_true_with_lets_encrypt_gateway(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config(certificate=LetsEncryptGatewayCertificate()) + assert _get_service_https(run_spec, gw) is True + + def test_auto_resolves_to_false_when_gateway_has_no_certificate(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config(certificate=None) + assert _get_service_https(run_spec, gw) is False + + def test_auto_resolves_to_false_with_acm_gateway(self) -> None: + run_spec = _service_run_spec(https="auto") + gw = _gateway_config( + certificate=ACMGatewayCertificate(arn="arn:aws:acm:us-east-1:123:cert/abc") + ) + assert _get_service_https(run_spec, gw) is False + + def test_true_enables_https_regardless_of_gateway_certificate(self) -> None: + run_spec = _service_run_spec(https=True) + gw = _gateway_config(certificate=None) + assert _get_service_https(run_spec, gw) is True + + def test_false_disables_https_regardless_of_gateway_certificate(self) -> None: + run_spec = _service_run_spec(https=False) + gw = 
_gateway_config(certificate=LetsEncryptGatewayCertificate()) + assert _get_service_https(run_spec, gw) is False + + +class TestRegisterServiceInServerHttps: + def test_allows_default_true_without_gateway(self) -> None: + run_spec = _service_run_spec(https=True) + result = _register_service_in_server(_mock_run_model(), run_spec) + assert result is not None + + def test_allows_auto_without_gateway(self) -> None: + run_spec = _service_run_spec(https="auto") + result = _register_service_in_server(_mock_run_model(), run_spec) + assert result is not None + + def test_rejects_explicit_false_without_gateway(self) -> None: + run_spec = _service_run_spec(https=False) + with pytest.raises(ServerClientError, match="not applicable"): + _register_service_in_server(_mock_run_model(), run_spec) From c9be23a32f9f62b750428428716bcb6ee6ba34a4 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 23 Feb 2026 12:20:58 +0500 Subject: [PATCH 161/187] Implement gateway pipeline (#3599) * Make pipelines generic over ItemT * Update AGENTS.md * Refactor instance emit events * Implement GatewayPipeline * Refactor * Make gateway deletion async * Add TestGatewayWorkerDeleted * Drop deleted_by_user cols * Handle to_be_deleted gateways in API * Fix delete gateways API tests * Merge migrations * Hint gateway pipeline * Restore sync gateways delete API * Do not run pipelines withou feature flag * Fix server_default in migration * Prevent dstack Sky gateway deletion * Remove implicit gateway_compute load * Remove implicit gateway.backend load --- AGENTS.md | 1 + src/dstack/_internal/server/app.py | 5 +- .../background/pipeline_tasks/__init__.py | 13 +- .../server/background/pipeline_tasks/base.py | 52 +- .../pipeline_tasks/compute_groups.py | 25 +- .../background/pipeline_tasks/gateways.py | 548 ++++++++++++++++++ .../pipeline_tasks/placement_groups.py | 16 +- .../background/scheduled_tasks/__init__.py | 8 +- .../background/scheduled_tasks/gateways.py | 9 +- 
...ce_add_gatewaymodel_pipeline_and_to_be_.py | 51 ++ src/dstack/_internal/server/models.py | 9 +- .../_internal/server/routers/gateways.py | 3 + .../server/services/gateways/__init__.py | 211 ++++++- .../_internal/server/services/instances.py | 31 +- .../_internal/server/services/pipelines.py | 20 +- .../server/services/services/__init__.py | 15 +- .../background/pipeline_tasks/test_base.py | 10 +- .../pipeline_tasks/test_gateways.py | 292 ++++++++++ .../pipeline_tasks/test_placement_groups.py | 3 +- .../scheduled_tasks/test_gateways.py | 9 + .../_internal/server/routers/test_gateways.py | 91 ++- 21 files changed, 1334 insertions(+), 88 deletions(-) create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/gateways.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/02_23_0548_140331002ece_add_gatewaymodel_pipeline_and_to_be_.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_gateways.py diff --git a/AGENTS.md b/AGENTS.md index cdf8497e1c..eb348b291b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,6 +17,7 @@ ## Coding Style & Naming Conventions - Python targets 3.9+ with 4-space indentation and max line length of 99 (see `ruff.toml`; `E501` is ignored but keep lines readable). - Imports are sorted via Ruff’s isort settings (`dstack` treated as first-party). +- Keep primary/public functions before local helper functions in a module section. - Prefer pydantic-style models in `core/models`. - Tests use `test_*.py` modules and `test_*` functions; fixtures live near usage. 
diff --git a/src/dstack/_internal/server/app.py b/src/dstack/_internal/server/app.py index 209679f0ef..03a54ccf2a 100644 --- a/src/dstack/_internal/server/app.py +++ b/src/dstack/_internal/server/app.py @@ -167,8 +167,9 @@ async def lifespan(app: FastAPI): pipeline_manager = None if settings.SERVER_BACKGROUND_PROCESSING_ENABLED: scheduler = start_scheduled_tasks() - pipeline_manager = start_pipeline_tasks() - app.state.pipeline_manager = pipeline_manager + if core_settings.FeatureFlags.PIPELINE_PROCESSING_ENABLED: + pipeline_manager = start_pipeline_tasks() + app.state.pipeline_manager = pipeline_manager else: logger.info("Background processing is disabled") PROBES_SCHEDULER.start() diff --git a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py index 355e042476..01feb958d0 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py @@ -2,10 +2,10 @@ from dstack._internal.server.background.pipeline_tasks.base import Pipeline from dstack._internal.server.background.pipeline_tasks.compute_groups import ComputeGroupPipeline +from dstack._internal.server.background.pipeline_tasks.gateways import GatewayPipeline from dstack._internal.server.background.pipeline_tasks.placement_groups import ( PlacementGroupPipeline, ) -from dstack._internal.settings import FeatureFlags from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) @@ -13,12 +13,11 @@ class PipelineManager: def __init__(self) -> None: - self._pipelines: list[Pipeline] = [] - if FeatureFlags.PIPELINE_PROCESSING_ENABLED: - self._pipelines += [ - ComputeGroupPipeline(), - PlacementGroupPipeline(), - ] + self._pipelines: list[Pipeline] = [ + ComputeGroupPipeline(), + GatewayPipeline(), + PlacementGroupPipeline(), + ] self._hinter = PipelineHinter(self._pipelines) def start(self): diff --git 
a/src/dstack/_internal/server/background/pipeline_tasks/base.py b/src/dstack/_internal/server/background/pipeline_tasks/base.py index 30be480bf9..9d016934cb 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/base.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/base.py @@ -19,6 +19,10 @@ @dataclass class PipelineItem: + """ + Pipelines can work with this class or its subclass if the worker needs to access extra attributes. + """ + __tablename__: str id: uuid.UUID lock_expires_at: datetime @@ -26,7 +30,14 @@ class PipelineItem: prev_lock_expired: bool +ItemT = TypeVar("ItemT", bound=PipelineItem) + + class PipelineModel(Protocol): + """ + Heartbeater can work with any DB model implementing this protocol. + """ + __tablename__: str __mapper__: ClassVar[Any] __table__: ClassVar[Any] @@ -39,7 +50,7 @@ class PipelineError(Exception): pass -class Pipeline(ABC): +class Pipeline(Generic[ItemT], ABC): def __init__( self, workers_num: int, @@ -57,7 +68,7 @@ def __init__( self._min_processing_interval = min_processing_interval self._lock_timeout = lock_timeout self._heartbeat_trigger = heartbeat_trigger - self._queue = asyncio.Queue[PipelineItem](maxsize=self._queue_maxsize) + self._queue = asyncio.Queue[ItemT](maxsize=self._queue_maxsize) self._tasks: list[asyncio.Task] = [] self._running = False self._shutdown = False @@ -119,27 +130,24 @@ def hint_fetch_model_name(self) -> str: @property @abstractmethod - def _heartbeater(self) -> "Heartbeater": + def _heartbeater(self) -> "Heartbeater[ItemT]": pass @property @abstractmethod - def _fetcher(self) -> "Fetcher": + def _fetcher(self) -> "Fetcher[ItemT]": pass @property @abstractmethod - def _workers(self) -> Sequence["Worker"]: + def _workers(self) -> Sequence["Worker[ItemT]"]: pass -ModelT = TypeVar("ModelT", bound=PipelineModel) - - -class Heartbeater(Generic[ModelT]): +class Heartbeater(Generic[ItemT]): def __init__( self, - model_type: type[ModelT], + model_type: type[PipelineModel], 
lock_timeout: timedelta, heartbeat_trigger: timedelta, heartbeat_delay: float = 1.0, @@ -147,7 +155,7 @@ def __init__( self._model_type = model_type self._lock_timeout = lock_timeout self._hearbeat_margin = heartbeat_trigger - self._items: dict[uuid.UUID, PipelineItem] = {} + self._items: dict[uuid.UUID, ItemT] = {} self._untrack_lock = asyncio.Lock() self._heartbeat_delay = heartbeat_delay self._running = False @@ -164,10 +172,10 @@ async def start(self): def stop(self): self._running = False - async def track(self, item: PipelineItem): + async def track(self, item: ItemT): self._items[item.id] = item - async def untrack(self, item: PipelineItem): + async def untrack(self, item: ItemT): async with self._untrack_lock: tracked = self._items.get(item.id) # Prevent expired fetch iteration to unlock item processed by new iteration. @@ -175,7 +183,7 @@ async def untrack(self, item: PipelineItem): del self._items[item.id] async def heartbeat(self): - items_to_update: list[PipelineItem] = [] + items_to_update: list[ItemT] = [] now = get_current_datetime() items = list(self._items.values()) failed_to_heartbeat_count = 0 @@ -227,16 +235,16 @@ async def heartbeat(self): ) -class Fetcher(ABC): +class Fetcher(Generic[ItemT], ABC): _DEFAULT_FETCH_DELAYS = [0.5, 1, 2, 5] def __init__( self, - queue: asyncio.Queue[PipelineItem], + queue: asyncio.Queue[ItemT], queue_desired_minsize: int, min_processing_interval: timedelta, lock_timeout: timedelta, - heartbeater: Heartbeater, + heartbeater: Heartbeater[ItemT], queue_check_delay: float = 1.0, fetch_delays: Optional[list[float]] = None, ) -> None: @@ -289,7 +297,7 @@ def hint(self): self._fetch_event.set() @abstractmethod - async def fetch(self, limit: int) -> list[PipelineItem]: + async def fetch(self, limit: int) -> list[ItemT]: pass def _next_fetch_delay(self, empty_fetch_count: int) -> float: @@ -298,11 +306,11 @@ def _next_fetch_delay(self, empty_fetch_count: int) -> float: return next_delay * (1 + jitter) -class Worker(ABC): 
+class Worker(Generic[ItemT], ABC): def __init__( self, - queue: asyncio.Queue[PipelineItem], - heartbeater: Heartbeater, + queue: asyncio.Queue[ItemT], + heartbeater: Heartbeater[ItemT], ) -> None: self._queue = queue self._heartbeater = heartbeater @@ -325,7 +333,7 @@ def stop(self): self._running = False @abstractmethod - async def process(self, item: PipelineItem): + async def process(self, item: ItemT): pass diff --git a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py index 685c5205a8..938c6013c8 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py @@ -25,7 +25,7 @@ from dstack._internal.server.models import ComputeGroupModel, InstanceModel, ProjectModel from dstack._internal.server.services import backends as backends_services from dstack._internal.server.services.compute_groups import compute_group_model_to_compute_group -from dstack._internal.server.services.instances import switch_instance_status +from dstack._internal.server.services.instances import emit_instance_status_change_event from dstack._internal.server.services.locking import get_locker from dstack._internal.utils.common import get_current_datetime, run_async from dstack._internal.utils.logging import get_logger @@ -36,7 +36,7 @@ TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15) -class ComputeGroupPipeline(Pipeline): +class ComputeGroupPipeline(Pipeline[PipelineItem]): def __init__( self, workers_num: int = 10, @@ -54,7 +54,7 @@ def __init__( lock_timeout=lock_timeout, heartbeat_trigger=heartbeat_trigger, ) - self.__heartbeater = Heartbeater[ComputeGroupModel]( + self.__heartbeater = Heartbeater[PipelineItem]( model_type=ComputeGroupModel, lock_timeout=self._lock_timeout, heartbeat_trigger=self._heartbeat_trigger, @@ -76,11 +76,11 @@ def hint_fetch_model_name(self) -> str: return 
ComputeGroupModel.__name__ @property - def _heartbeater(self) -> Heartbeater: + def _heartbeater(self) -> Heartbeater[PipelineItem]: return self.__heartbeater @property - def _fetcher(self) -> Fetcher: + def _fetcher(self) -> Fetcher[PipelineItem]: return self.__fetcher @property @@ -88,14 +88,14 @@ def _workers(self) -> Sequence["ComputeGroupWorker"]: return self.__workers -class ComputeGroupFetcher(Fetcher): +class ComputeGroupFetcher(Fetcher[PipelineItem]): def __init__( self, queue: asyncio.Queue[PipelineItem], queue_desired_minsize: int, min_processing_interval: timedelta, lock_timeout: timedelta, - heartbeater: Heartbeater[ComputeGroupModel], + heartbeater: Heartbeater[PipelineItem], queue_check_delay: float = 1.0, ) -> None: super().__init__( @@ -161,11 +161,11 @@ async def fetch(self, limit: int) -> list[PipelineItem]: return items -class ComputeGroupWorker(Worker): +class ComputeGroupWorker(Worker[PipelineItem]): def __init__( self, queue: asyncio.Queue[PipelineItem], - heartbeater: Heartbeater[ComputeGroupModel], + heartbeater: Heartbeater[PipelineItem], ) -> None: super().__init__( queue=queue, @@ -235,7 +235,12 @@ async def process(self, item: PipelineItem): .values(**terminate_result.instances_update_map) ) for instance_model in compute_group_model.instances: - switch_instance_status(session, instance_model, InstanceStatus.TERMINATED) + emit_instance_status_change_event( + session=session, + instance_model=instance_model, + old_status=instance_model.status, + new_status=InstanceStatus.TERMINATED, + ) @dataclass diff --git a/src/dstack/_internal/server/background/pipeline_tasks/gateways.py b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py new file mode 100644 index 0000000000..c64cd719a2 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py @@ -0,0 +1,548 @@ +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Optional, Sequence + 
+from sqlalchemy import delete, or_, select, update +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.backends.base.compute import ComputeWithGatewaySupport +from dstack._internal.core.errors import BackendError, BackendNotAvailable +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.gateways import GatewayStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + Pipeline, + PipelineItem, + UpdateMap, + Worker, + get_processed_update_map, + get_unlock_update_map, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + BackendModel, + GatewayComputeModel, + GatewayModel, + ProjectModel, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import events +from dstack._internal.server.services import gateways as gateways_services +from dstack._internal.server.services.gateways import emit_gateway_status_change_event +from dstack._internal.server.services.gateways.pool import gateway_connections_pool +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.logging import fmt +from dstack._internal.utils.common import get_current_datetime, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +@dataclass +class GatewayPipelineItem(PipelineItem): + status: GatewayStatus + to_be_deleted: bool + + +class GatewayPipeline(Pipeline[GatewayPipelineItem]): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=15), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + ) -> None: + super().__init__( + workers_num=workers_num, + 
queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[GatewayPipelineItem]( + model_type=GatewayModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = GatewayFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + GatewayWorker(queue=self._queue, heartbeater=self._heartbeater) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return GatewayModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[GatewayPipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[GatewayPipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["GatewayWorker"]: + return self.__workers + + +class GatewayFetcher(Fetcher[GatewayPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[GatewayPipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[GatewayPipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + async def fetch(self, limit: int) -> list[GatewayPipelineItem]: + gateway_lock, _ = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__) + async with gateway_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(GatewayModel) + 
.where( + or_( + GatewayModel.status.in_( + [GatewayStatus.SUBMITTED, GatewayStatus.PROVISIONING] + ), + GatewayModel.to_be_deleted == True, + ), + or_( + GatewayModel.last_processed_at <= now - self._min_processing_interval, + GatewayModel.last_processed_at == GatewayModel.created_at, + ), + or_( + GatewayModel.lock_expires_at.is_(None), + GatewayModel.lock_expires_at < now, + ), + or_( + GatewayModel.lock_owner.is_(None), + GatewayModel.lock_owner == GatewayPipeline.__name__, + ), + ) + .order_by(GatewayModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True) + .options( + load_only( + GatewayModel.id, + GatewayModel.lock_token, + GatewayModel.lock_expires_at, + GatewayModel.status, + GatewayModel.to_be_deleted, + ) + ) + ) + gateway_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for gateway_model in gateway_models: + prev_lock_expired = gateway_model.lock_expires_at is not None + gateway_model.lock_expires_at = lock_expires_at + gateway_model.lock_token = lock_token + gateway_model.lock_owner = GatewayPipeline.__name__ + items.append( + GatewayPipelineItem( + __tablename__=GatewayModel.__tablename__, + id=gateway_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + status=gateway_model.status, + to_be_deleted=gateway_model.to_be_deleted, + ) + ) + await session.commit() + return items + + +class GatewayWorker(Worker[GatewayPipelineItem]): + def __init__( + self, + queue: asyncio.Queue[GatewayPipelineItem], + heartbeater: Heartbeater[GatewayPipelineItem], + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + ) + + async def process(self, item: GatewayPipelineItem): + if item.to_be_deleted: + await _process_to_be_deleted_item(item) + elif item.status == GatewayStatus.SUBMITTED: + await _process_submitted_item(item) + elif item.status == 
GatewayStatus.PROVISIONING: + await _process_provisioning_item(item) + + +async def _process_submitted_item(item: GatewayPipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(GatewayModel) + .where( + GatewayModel.id == item.id, + GatewayModel.lock_token == item.lock_token, + ) + .options(joinedload(GatewayModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + ) + gateway_model = res.unique().scalar_one_or_none() + if gateway_model is None: + logger.warning( + "Failed to process %s item %s: lock_token mismatch." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + result = await _process_submitted_gateway(gateway_model) + update_map = result.update_map | get_processed_update_map() | get_unlock_update_map() + async with get_session_ctx() as session: + gateway_compute_model = result.gateway_compute_model + if gateway_compute_model is not None: + session.add(gateway_compute_model) + await session.flush() + update_map["gateway_compute_id"] = gateway_compute_model.id + res = await session.execute( + update(GatewayModel) + .where( + GatewayModel.id == gateway_model.id, + GatewayModel.lock_token == gateway_model.lock_token, + ) + .values(**update_map) + .returning(GatewayModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.warning( + "Failed to update %s item %s after processing: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + # TODO: Clean up gateway_compute_model. 
+ return + emit_gateway_status_change_event( + session=session, + gateway_model=gateway_model, + old_status=gateway_model.status, + new_status=update_map.get("status", gateway_model.status), + status_message=update_map.get("status_message", gateway_model.status_message), + ) + + +@dataclass +class _SubmittedResult: + update_map: UpdateMap = field(default_factory=dict) + gateway_compute_model: Optional[GatewayComputeModel] = None + + +async def _process_submitted_gateway(gateway_model: GatewayModel) -> _SubmittedResult: + logger.info("%s: started gateway provisioning", fmt(gateway_model)) + configuration = gateways_services.get_gateway_configuration(gateway_model) + try: + ( + backend_model, + backend, + ) = await backends_services.get_project_backend_with_model_by_type_or_error( + project=gateway_model.project, backend_type=configuration.backend + ) + except BackendNotAvailable: + return _SubmittedResult( + update_map={ + "status": GatewayStatus.FAILED, + "status_message": "Backend not available", + } + ) + try: + gateway_compute_model = await gateways_services.create_gateway_compute( + backend_compute=backend.compute(), + project_name=gateway_model.project.name, + configuration=configuration, + backend_id=backend_model.id, + ) + return _SubmittedResult( + update_map={"status": GatewayStatus.PROVISIONING}, + gateway_compute_model=gateway_compute_model, + ) + except BackendError as e: + status_message = f"Backend error: {repr(e)}" + if len(e.args) > 0: + status_message = str(e.args[0]) + return _SubmittedResult( + update_map={ + "status": GatewayStatus.FAILED, + "status_message": status_message, + } + ) + except Exception as e: + logger.exception("%s: got exception when creating gateway compute", fmt(gateway_model)) + return _SubmittedResult( + update_map={ + "status": GatewayStatus.FAILED, + "status_message": f"Unexpected error: {repr(e)}", + } + ) + + +async def _process_provisioning_item(item: GatewayPipelineItem): + async with get_session_ctx() as session: + res 
= await session.execute( + select(GatewayModel) + .where( + GatewayModel.id == item.id, + GatewayModel.lock_token == item.lock_token, + ) + .options(joinedload(GatewayModel.gateway_compute)) + ) + gateway_model = res.unique().scalar_one_or_none() + if gateway_model is None: + logger.warning( + "Failed to process %s item %s: lock_token mismatch." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + result = await _process_provisioning_gateway(gateway_model) + update_map = result.gateway_update_map | get_processed_update_map() | get_unlock_update_map() + async with get_session_ctx() as session: + res = await session.execute( + update(GatewayModel) + .where( + GatewayModel.id == gateway_model.id, + GatewayModel.lock_token == gateway_model.lock_token, + ) + .values(**update_map) + .returning(GatewayModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.warning( + "Failed to update %s item %s after processing: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + emit_gateway_status_change_event( + session=session, + gateway_model=gateway_model, + old_status=gateway_model.status, + new_status=update_map.get("status", gateway_model.status), + status_message=update_map.get("status_message", gateway_model.status_message), + ) + if result.gateway_compute_update_map: + res = await session.execute( + update(GatewayComputeModel) + .where(GatewayComputeModel.id == gateway_model.gateway_compute_id) + .values(**result.gateway_compute_update_map) + .returning(GatewayComputeModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.error( + "Failed to update compute model %s for gateway %s." 
+ " This is unexpected and may happen only if the compute model was manually deleted.", + gateway_model.id, + item.id, + ) + + +@dataclass +class _ProvisioningResult: + gateway_update_map: UpdateMap = field(default_factory=dict) + gateway_compute_update_map: UpdateMap = field(default_factory=dict) + + +async def _process_provisioning_gateway(gateway_model: GatewayModel) -> _ProvisioningResult: + # Provisioning gateways must have compute. + assert gateway_model.gateway_compute is not None + + # FIXME: problems caused by blocking on connect_to_gateway_with_retry and configure_gateway: + # - cannot delete the gateway before it is provisioned because the DB model is locked + # - connection retry counter is reset on server restart + # - only one server replica is processing the gateway + # Easy to fix by doing only one connection/configuration attempt per processing iteration. The + # main challenge is applying the same provisioning model to the dstack Sky gateway to avoid + # maintaining a different model for Sky. 
+ connection = await gateways_services.connect_to_gateway_with_retry( + gateway_model.gateway_compute + ) + if connection is None: + return _ProvisioningResult( + gateway_update_map={ + "status": GatewayStatus.FAILED, + "status_message": "Failed to connect to gateway", + }, + gateway_compute_update_map={"active": False}, + ) + try: + await gateways_services.configure_gateway(connection) + except Exception: + logger.exception("%s: failed to configure gateway", fmt(gateway_model)) + await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address) + return _ProvisioningResult( + gateway_update_map={ + "status": GatewayStatus.FAILED, + "status_message": "Failed to configure gateway", + }, + gateway_compute_update_map={"active": False}, + ) + return _ProvisioningResult( + gateway_update_map={"status": GatewayStatus.RUNNING}, + ) + + +async def _process_to_be_deleted_item(item: GatewayPipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(GatewayModel) + .where( + GatewayModel.id == item.id, + GatewayModel.lock_token == item.lock_token, + ) + .options(joinedload(GatewayModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(GatewayModel.gateway_compute)) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + ) + gateway_model = res.unique().scalar_one_or_none() + if gateway_model is None: + logger.warning( + "Failed to process %s item %s: lock_token mismatch." 
+ " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + result = await _process_to_be_deleted_gateway(gateway_model) + async with get_session_ctx() as session: + if result.delete_gateway: + res = await session.execute( + delete(GatewayModel) + .where( + GatewayModel.id == gateway_model.id, + GatewayModel.lock_token == gateway_model.lock_token, + ) + .returning(GatewayModel.id) + ) + deleted_ids = list(res.scalars().all()) + if len(deleted_ids) == 0: + logger.warning( + "Failed to delete %s item %s after processing: lock_token changed." + " The item is expected to be processed and deleted on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + events.emit( + session, + "Gateway deleted", + actor=events.SystemActor(), + targets=[events.Target.from_model(gateway_model)], + ) + else: + res = await session.execute( + update(GatewayModel) + .where( + GatewayModel.id == gateway_model.id, + GatewayModel.lock_token == gateway_model.lock_token, + ) + .values(**get_processed_update_map()) + .returning(GatewayModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.warning( + "Failed to update %s item %s after processing: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + if result.gateway_compute_update_map: + res = await session.execute( + update(GatewayComputeModel) + .where(GatewayComputeModel.id == gateway_model.gateway_compute_id) + .values(**result.gateway_compute_update_map) + .returning(GatewayComputeModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.error( + "Failed to update compute model %s for gateway %s." 
+ " This is unexpected and may happen only if the compute model was manually deleted.", + gateway_model.id, + item.id, + ) + return + + +@dataclass +class _DeletedResult: + delete_gateway: bool + gateway_compute_update_map: UpdateMap = field(default_factory=dict) + + +async def _process_to_be_deleted_gateway(gateway_model: GatewayModel) -> _DeletedResult: + assert gateway_model.backend.type != BackendType.DSTACK + backend = await backends_services.get_project_backend_by_type_or_error( + project=gateway_model.project, backend_type=gateway_model.backend.type + ) + compute = backend.compute() + assert isinstance(compute, ComputeWithGatewaySupport) + gateway_compute_configuration = gateways_services.get_gateway_compute_configuration( + gateway_model + ) + if gateway_model.gateway_compute is not None and gateway_compute_configuration is not None: + logger.info("Deleting gateway compute for %s...", gateway_model.name) + try: + await run_async( + compute.terminate_gateway, + gateway_model.gateway_compute.instance_id, + gateway_compute_configuration, + gateway_model.gateway_compute.backend_data, + ) + except Exception: + logger.exception( + "Error when deleting gateway compute for %s", + gateway_model.name, + ) + return _DeletedResult(delete_gateway=False) + logger.info("Deleted gateway compute for %s", gateway_model.name) + result = _DeletedResult(delete_gateway=True) + if gateway_model.gateway_compute is not None: + await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address) + result.gateway_compute_update_map = {"active": False, "deleted": True} + return result diff --git a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py index 9fac5665a5..a184379c37 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py @@ -32,7 +32,7 @@ logger = 
get_logger(__name__) -class PlacementGroupPipeline(Pipeline): +class PlacementGroupPipeline(Pipeline[PipelineItem]): def __init__( self, workers_num: int = 10, @@ -50,7 +50,7 @@ def __init__( lock_timeout=lock_timeout, heartbeat_trigger=heartbeat_trigger, ) - self.__heartbeater = Heartbeater[PlacementGroupModel]( + self.__heartbeater = Heartbeater[PipelineItem]( model_type=PlacementGroupModel, lock_timeout=self._lock_timeout, heartbeat_trigger=self._heartbeat_trigger, @@ -72,11 +72,11 @@ def hint_fetch_model_name(self) -> str: return PlacementGroupModel.__name__ @property - def _heartbeater(self) -> Heartbeater: + def _heartbeater(self) -> Heartbeater[PipelineItem]: return self.__heartbeater @property - def _fetcher(self) -> Fetcher: + def _fetcher(self) -> Fetcher[PipelineItem]: return self.__fetcher @property @@ -84,14 +84,14 @@ def _workers(self) -> Sequence["PlacementGroupWorker"]: return self.__workers -class PlacementGroupFetcher(Fetcher): +class PlacementGroupFetcher(Fetcher[PipelineItem]): def __init__( self, queue: asyncio.Queue[PipelineItem], queue_desired_minsize: int, min_processing_interval: timedelta, lock_timeout: timedelta, - heartbeater: Heartbeater[PlacementGroupModel], + heartbeater: Heartbeater[PipelineItem], queue_check_delay: float = 1.0, ) -> None: super().__init__( @@ -159,11 +159,11 @@ async def fetch(self, limit: int) -> list[PipelineItem]: return items -class PlacementGroupWorker(Worker): +class PlacementGroupWorker(Worker[PipelineItem]): def __init__( self, queue: asyncio.Queue[PipelineItem], - heartbeater: Heartbeater[PlacementGroupModel], + heartbeater: Heartbeater[PipelineItem], ) -> None: super().__init__( queue=queue, diff --git a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py index c4baf96c58..6067d9d4de 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py +++ 
b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py @@ -99,21 +99,23 @@ def start_scheduled_tasks() -> AsyncIOScheduler: ) _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1) _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15)) - _scheduler.add_job(process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5) _scheduler.add_job( process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5 ) _scheduler.add_job( process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1 ) - if not FeatureFlags.PIPELINE_PROCESSING_ENABLED: - _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5)) _scheduler.add_job( process_fleets, IntervalTrigger(seconds=10, jitter=2), max_instances=1, ) _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1) + if not FeatureFlags.PIPELINE_PROCESSING_ENABLED: + _scheduler.add_job( + process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5 + ) + _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5)) for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR): # Add multiple copies of tasks if requested. # max_instances=1 for additional copies to avoid running too many tasks. 
diff --git a/src/dstack/_internal/server/background/scheduled_tasks/gateways.py b/src/dstack/_internal/server/background/scheduled_tasks/gateways.py index 2566a4f4d8..3b6bee012e 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/gateways.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/gateways.py @@ -7,7 +7,12 @@ from dstack._internal.core.errors import BackendError, BackendNotAvailable, SSHError from dstack._internal.core.models.gateways import GatewayStatus from dstack._internal.server.db import get_db, get_session_ctx -from dstack._internal.server.models import GatewayComputeModel, GatewayModel, ProjectModel +from dstack._internal.server.models import ( + BackendModel, + GatewayComputeModel, + GatewayModel, + ProjectModel, +) from dstack._internal.server.services import backends as backends_services from dstack._internal.server.services import gateways as gateways_services from dstack._internal.server.services.gateways import ( @@ -109,6 +114,7 @@ async def _process_submitted_gateway(session: AsyncSession, gateway_model: Gatew select(GatewayModel) .where(GatewayModel.id == gateway_model.id) .options(joinedload(GatewayModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) .execution_options(populate_existing=True) ) gateway_model = res.unique().scalar_one() @@ -153,6 +159,7 @@ async def _process_provisioning_gateway( res = await session.execute( select(GatewayModel) .where(GatewayModel.id == gateway_model.id) + .options(joinedload(GatewayModel.gateway_compute)) .execution_options(populate_existing=True) ) gateway_model = res.unique().scalar_one() diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_23_0548_140331002ece_add_gatewaymodel_pipeline_and_to_be_.py b/src/dstack/_internal/server/migrations/versions/2026/02_23_0548_140331002ece_add_gatewaymodel_pipeline_and_to_be_.py new file mode 100644 index 0000000000..fa3c8ce30c --- /dev/null +++ 
b/src/dstack/_internal/server/migrations/versions/2026/02_23_0548_140331002ece_add_gatewaymodel_pipeline_and_to_be_.py @@ -0,0 +1,51 @@ +"""Add GatewayModel pipeline and to_be_deleted columns + +Revision ID: 140331002ece +Revises: a8ed24fd7f90 +Create Date: 2026-02-23 05:48:55.948838+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "140331002ece" +down_revision = "a8ed24fd7f90" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.add_column( + sa.Column("to_be_deleted", sa.Boolean(), server_default=sa.false(), nullable=False) + ) + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("gateways", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + batch_op.drop_column("to_be_deleted") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index a837137a10..df9cf86078 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -492,7 +492,7 @@ class JobModel(BaseModel): waiting_master_job: Mapped[Optional[bool]] = mapped_column(Boolean) -class GatewayModel(BaseModel): +class GatewayModel(PipelineModelMixin, BaseModel): __tablename__ = "gateways" id: Mapped[uuid.UUID] = mapped_column( @@ -508,21 +508,24 @@ class GatewayModel(BaseModel): status: Mapped[GatewayStatus] = mapped_column(EnumAsString(GatewayStatus, 100)) status_message: Mapped[Optional[str]] = mapped_column(Text) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) + to_be_deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) backend_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("backends.id", ondelete="CASCADE")) - backend: Mapped["BackendModel"] = relationship(lazy="selectin") + backend: Mapped["BackendModel"] = relationship() gateway_compute_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("gateway_computes.id", ondelete="CASCADE") ) - gateway_compute: Mapped[Optional["GatewayComputeModel"]] = relationship(lazy="joined") + gateway_compute: Mapped[Optional["GatewayComputeModel"]] = relationship() runs: Mapped[List["RunModel"]] = relationship(back_populates="gateway") __table_args__ = (UniqueConstraint("project_id", "name", name="uq_gateways_project_id_name"),) + # TODO: Add pipeline index ("ix_gateways_pipeline_fetch_q") if gateways become 
soft-deleted. + class GatewayComputeModel(BaseModel): __tablename__ = "gateway_computes" diff --git a/src/dstack/_internal/server/routers/gateways.py b/src/dstack/_internal/server/routers/gateways.py index 0f89e5db45..af4557a449 100644 --- a/src/dstack/_internal/server/routers/gateways.py +++ b/src/dstack/_internal/server/routers/gateways.py @@ -13,6 +13,7 @@ ProjectAdmin, ProjectMemberOrPublicAccess, ) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol, get_pipeline_hinter from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, @@ -54,6 +55,7 @@ async def create_gateway( body: schemas.CreateGatewayRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectAdmin()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), ): user, project = user_project return CustomORJSONResponse( @@ -62,6 +64,7 @@ async def create_gateway( user=user, project=project, configuration=body.configuration, + pipeline_hinter=pipeline_hinter, ) ) diff --git a/src/dstack/_internal/server/services/gateways/__init__.py b/src/dstack/_internal/server/services/gateways/__init__.py index ab89c2a7c8..762af8bef1 100644 --- a/src/dstack/_internal/server/services/gateways/__init__.py +++ b/src/dstack/_internal/server/services/gateways/__init__.py @@ -10,7 +10,7 @@ import httpx from sqlalchemy import func, select, update from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload +from sqlalchemy.orm import joinedload import dstack._internal.utils.random_names as random_names from dstack._internal.core.backends.base.compute import ( @@ -42,6 +42,7 @@ from dstack._internal.server import settings from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite from dstack._internal.server.models import ( + BackendModel, GatewayComputeModel, GatewayModel, ProjectModel, @@ -60,8 +61,10 @@ get_locker, 
string_to_lock_id, ) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol from dstack._internal.server.services.plugins import apply_plugin_policies from dstack._internal.server.utils.common import gather_map_async +from dstack._internal.settings import FeatureFlags from dstack._internal.utils.common import get_current_datetime, run_async from dstack._internal.utils.crypto import generate_rsa_key_pair_bytes from dstack._internal.utils.logging import get_logger @@ -80,13 +83,43 @@ def switch_gateway_status( return gateway_model.status = new_status + emit_gateway_status_change_event( + session=session, + gateway_model=gateway_model, + old_status=old_status, + new_status=new_status, + status_message=gateway_model.status_message, + actor=actor, + ) - msg = f"Gateway status changed {old_status.upper()} -> {new_status.upper()}" - if gateway_model.status_message is not None: - msg += f" ({gateway_model.status_message})" + +def emit_gateway_status_change_event( + session: AsyncSession, + gateway_model: GatewayModel, + old_status: GatewayStatus, + new_status: GatewayStatus, + status_message: Optional[str], + actor: events.AnyActor = events.SystemActor(), +) -> None: + if old_status == new_status: + return + msg = get_gateway_status_change_message( + old_status=old_status, + new_status=new_status, + status_message=status_message, + ) events.emit(session, msg, actor=actor, targets=[events.Target.from_model(gateway_model)]) +def get_gateway_status_change_message( + old_status: GatewayStatus, new_status: GatewayStatus, status_message: Optional[str] +) -> str: + msg = f"Gateway status changed {old_status.upper()} -> {new_status.upper()}" + if status_message is not None: + msg += f" ({status_message})" + return msg + + GATEWAY_CONNECT_ATTEMPTS = 30 GATEWAY_CONNECT_DELAY = 10 GATEWAY_CONFIGURE_ATTEMPTS = 50 @@ -94,14 +127,25 @@ def switch_gateway_status( async def list_project_gateways(session: AsyncSession, project: ProjectModel) -> List[Gateway]: - 
gateways = await list_project_gateway_models(session=session, project=project) + gateways = await list_project_gateway_models( + session=session, + project=project, + load_gateway_compute=True, + load_backend_type=True, + ) return [gateway_model_to_gateway(g) for g in gateways] async def get_gateway_by_name( session: AsyncSession, project: ProjectModel, name: str ) -> Optional[Gateway]: - gateway = await get_project_gateway_model_by_name(session=session, project=project, name=name) + gateway = await get_project_gateway_model_by_name( + session=session, + project=project, + name=name, + load_gateway_compute=True, + load_backend_type=True, + ) if gateway is None: return None return gateway_model_to_gateway(gateway) @@ -156,6 +200,7 @@ async def create_gateway( user: UserModel, project: ProjectModel, configuration: GatewayConfiguration, + pipeline_hinter: PipelineHinterProtocol, ) -> Gateway: spec = await apply_plugin_policies( user=user.name, @@ -183,6 +228,7 @@ async def create_gateway( if configuration.name is None: configuration.name = await generate_gateway_name(session=session, project=project) + now = get_current_datetime() gateway = GatewayModel( id=uuid.uuid4(), name=configuration.name, @@ -192,7 +238,8 @@ async def create_gateway( wildcard_domain=configuration.domain, configuration=configuration.json(), status=GatewayStatus.SUBMITTED, - last_processed_at=get_current_datetime(), + created_at=now, + last_processed_at=now, ) session.add(gateway) events.emit( @@ -208,6 +255,15 @@ async def create_gateway( await set_default_gateway( session=session, project=project, name=configuration.name, user=user ) + pipeline_hinter.hint_fetch(GatewayModel.__name__) + gateway = await get_project_gateway_model_by_name( + session=session, + project=project, + name=configuration.name, + load_gateway_compute=True, + load_backend_type=True, + ) + assert gateway is not None return gateway_model_to_gateway(gateway) @@ -245,6 +301,86 @@ async def delete_gateways( project: 
ProjectModel, gateways_names: List[str], user: UserModel, +): + # Keep both delete code paths while pipeline processing is behind a feature flag: + # - pipeline path marks gateways for async deletion by GatewayPipeline + # - sync path deletes gateway resources inline for non-pipeline processing + # TODO: Drop sync path after pipeline processing is enabled by default. + if FeatureFlags.PIPELINE_PROCESSING_ENABLED: + await _delete_gateways_pipeline( + session=session, + project=project, + gateways_names=gateways_names, + user=user, + ) + else: + await _delete_gateways_sync( + session=session, + project=project, + gateways_names=gateways_names, + user=user, + ) + + +async def _delete_gateways_pipeline( + session: AsyncSession, + project: ProjectModel, + gateways_names: List[str], + user: UserModel, +): + res = await session.execute( + select(GatewayModel).where( + GatewayModel.project_id == project.id, + GatewayModel.name.in_(gateways_names), + ) + ) + gateway_models = res.scalars().all() + gateways_ids = sorted([g.id for g in gateway_models]) + await session.commit() + logger.info("Deleting gateways: %s", [g.name for g in gateway_models]) + async with get_locker(get_db().dialect_name).lock_ctx( + GatewayModel.__tablename__, gateways_ids + ): + # Refetch after lock + res = await session.execute( + select(GatewayModel) + .where( + GatewayModel.id.in_(gateways_ids), + GatewayModel.project_id == project.id, + GatewayModel.lock_expires_at.is_(None), + ) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + .order_by(GatewayModel.id) # take locks in order + .with_for_update(key_share=True, nowait=True, of=GatewayModel) + .execution_options(populate_existing=True) + ) + gateway_models = res.scalars().all() + if len(gateway_models) != len(gateways_ids): + # TODO: Make the delete endpoint fully async so we don't need to lock and error: + # put the request in queue and process in the background. 
+ raise ServerClientError( + "Failed to delete gateways: gateways are being processed currently. Try again later." + ) + for gateway_model in gateway_models: + if gateway_model.backend.type == BackendType.DSTACK: + raise ServerClientError("Cannot delete dstack Sky gateway") + for gateway_model in gateway_models: + if not gateway_model.to_be_deleted: + gateway_model.to_be_deleted = True + events.emit( + session, + "Gateway marked for deletion", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(gateway_model)], + ) + await session.commit() + + +async def _delete_gateways_sync( + session: AsyncSession, + project: ProjectModel, + gateways_names: List[str], + user: UserModel, ): res = await session.execute( select(GatewayModel).where( @@ -266,10 +402,11 @@ async def delete_gateways( GatewayModel.project_id == project.id, GatewayModel.name.in_(gateways_names), ) - .options(selectinload(GatewayModel.gateway_compute)) + .options(joinedload(GatewayModel.gateway_compute)) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) .execution_options(populate_existing=True) .order_by(GatewayModel.id) # take locks in order - .with_for_update(key_share=True) + .with_for_update(key_share=True, of=GatewayModel) ) gateway_models = res.scalars().all() for gateway_model in gateway_models: @@ -346,6 +483,8 @@ async def set_default_gateway( gateway = await get_project_gateway_model_by_name(session=session, project=project, name=name) if gateway is None: raise ResourceNotExistsError() + if gateway.to_be_deleted: + raise ServerClientError("Cannot set gateway marked for deletion as default") if project.default_gateway_id == gateway.id: return previous_gateway = await get_project_default_gateway_model(session, project) @@ -375,20 +514,36 @@ async def set_default_gateway( async def list_project_gateway_models( - session: AsyncSession, project: ProjectModel + session: AsyncSession, + project: ProjectModel, + load_gateway_compute: bool = False, + 
load_backend_type: bool = False, ) -> Sequence[GatewayModel]: - res = await session.execute(select(GatewayModel).where(GatewayModel.project_id == project.id)) + stmt = select(GatewayModel).where(GatewayModel.project_id == project.id) + if load_gateway_compute: + stmt = stmt.options(joinedload(GatewayModel.gateway_compute)) + if load_backend_type: + stmt = stmt.options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + res = await session.execute(stmt) return res.scalars().all() async def get_project_gateway_model_by_name( - session: AsyncSession, project: ProjectModel, name: str + session: AsyncSession, + project: ProjectModel, + name: str, + load_gateway_compute: bool = False, + load_backend_type: bool = False, ) -> Optional[GatewayModel]: - res = await session.execute( - select(GatewayModel).where( - GatewayModel.project_id == project.id, GatewayModel.name == name - ) + stmt = select(GatewayModel).where( + GatewayModel.project_id == project.id, + GatewayModel.name == name, ) + if load_gateway_compute: + stmt = stmt.options(joinedload(GatewayModel.gateway_compute)) + if load_backend_type: + stmt = stmt.options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + res = await session.execute(stmt) return res.scalar() @@ -419,17 +574,28 @@ async def get_project_gateway_model_by_name_for_update( res = await session.execute( select(GatewayModel) .where(GatewayModel.id.in_([gateway_id]), *filters) + .options(joinedload(GatewayModel.gateway_compute)) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) .with_for_update(key_share=True, of=GatewayModel) ) yield res.scalar_one_or_none() async def get_project_default_gateway_model( - session: AsyncSession, project: ProjectModel + session: AsyncSession, + project: ProjectModel, + load_gateway_compute: bool = False, + load_backend_type: bool = False, ) -> Optional[GatewayModel]: - res = await session.execute( - select(GatewayModel).where(GatewayModel.id == 
project.default_gateway_id) + stmt = select(GatewayModel).where( + GatewayModel.id == project.default_gateway_id, + GatewayModel.to_be_deleted == False, ) + if load_gateway_compute: + stmt = stmt.options(joinedload(GatewayModel.gateway_compute)) + if load_backend_type: + stmt = stmt.options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + res = await session.execute(stmt) return res.scalar_one_or_none() @@ -445,7 +611,12 @@ async def generate_gateway_name(session: AsyncSession, project: ProjectModel) -> async def get_or_add_gateway_connection( session: AsyncSession, gateway_id: uuid.UUID ) -> tuple[GatewayModel, GatewayConnection]: - gateway = await session.get(GatewayModel, gateway_id) + gateway = await session.get( + GatewayModel, + gateway_id, + options=[joinedload(GatewayModel.gateway_compute)], + populate_existing=True, + ) if gateway is None: raise GatewayError("Gateway not found") if gateway.gateway_compute is None: diff --git a/src/dstack/_internal/server/services/instances.py b/src/dstack/_internal/server/services/instances.py index 8506ad2731..f37e1c9682 100644 --- a/src/dstack/_internal/server/services/instances.py +++ b/src/dstack/_internal/server/services/instances.py @@ -82,9 +82,36 @@ def switch_instance_status( old_status = instance_model.status if old_status == new_status: return - instance_model.status = new_status + emit_instance_status_change_event( + session=session, + instance_model=instance_model, + old_status=old_status, + new_status=new_status, + actor=actor, + ) + +def emit_instance_status_change_event( + session: AsyncSession, + instance_model: InstanceModel, + old_status: InstanceStatus, + new_status: InstanceStatus, + actor: events.AnyActor = events.SystemActor(), +) -> None: + if old_status == new_status: + return + msg = get_instance_status_change_message( + instance_model=instance_model, + old_status=old_status, + new_status=new_status, + ) + events.emit(session, msg, actor=actor, 
targets=[events.Target.from_model(instance_model)]) + + +def get_instance_status_change_message( + instance_model: InstanceModel, old_status: InstanceStatus, new_status: InstanceStatus +) -> str: msg = f"Instance status changed {old_status.upper()} -> {new_status.upper()}" if ( new_status == InstanceStatus.TERMINATING @@ -105,7 +132,7 @@ def switch_instance_status( msg += f". Termination reason: {instance_model.termination_reason.upper()}" if instance_model.termination_reason_message: msg += f" ({instance_model.termination_reason_message})" - events.emit(session, msg, actor=actor, targets=[events.Target.from_model(instance_model)]) + return msg def format_instance_blocks_for_event(instance_model: InstanceModel) -> str: diff --git a/src/dstack/_internal/server/services/pipelines.py b/src/dstack/_internal/server/services/pipelines.py index 19f4df902d..cbe2a28742 100644 --- a/src/dstack/_internal/server/services/pipelines.py +++ b/src/dstack/_internal/server/services/pipelines.py @@ -5,8 +5,26 @@ class PipelineHinterProtocol(Protocol): def hint_fetch(self, model_name: str) -> None: + """ + Pass `Model.__name__` to hint replica's pipelines to fetch the model's items ASAP. + """ pass +class _NoopPipelineHinter: + def hint_fetch(self, model_name: str) -> None: + pass + + +_noop_pipeline_hinter = _NoopPipelineHinter() + + def get_pipeline_hinter(request: Request) -> PipelineHinterProtocol: - return request.app.state.pipeline_manager.hinter + """ + Returns pipeline hinter that allows hinting replica's pipelines that there are new items for processing. + This can reduce processing latency if the processing happens rarely. 
+ """ + pipeline_manager = getattr(request.app.state, "pipeline_manager", None) + if pipeline_manager is None: + return _noop_pipeline_hinter + return pipeline_manager.hinter diff --git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index 8f0849050f..3846f5d6a8 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -91,17 +91,28 @@ async def register_service(session: AsyncSession, run_model: RunModel, run_spec: if isinstance(run_spec.configuration.gateway, str): gateway = await get_project_gateway_model_by_name( - session=session, project=run_model.project, name=run_spec.configuration.gateway + session=session, + project=run_model.project, + name=run_spec.configuration.gateway, + load_gateway_compute=True, + load_backend_type=True, ) if gateway is None: raise ResourceNotExistsError( f"Gateway {run_spec.configuration.gateway} does not exist" ) + if gateway.to_be_deleted: + raise ResourceNotExistsError( + f"Gateway {run_spec.configuration.gateway} was marked for deletion" + ) elif run_spec.configuration.gateway == False: gateway = None else: gateway = await get_project_default_gateway_model( - session=session, project=run_model.project + session=session, + project=run_model.project, + load_gateway_compute=True, + load_backend_type=True, ) if gateway is None and run_spec.configuration.gateway == True: raise ResourceNotExistsError( diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_base.py b/src/tests/_internal/server/background/pipeline_tasks/test_base.py index 7e84d9f80d..303fb0854e 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_base.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_base.py @@ -21,7 +21,7 @@ def now() -> datetime: @pytest.fixture -def heartbeater() -> Heartbeater[PlacementGroupModel]: +def heartbeater() -> Heartbeater[PipelineItem]: 
return Heartbeater( model_type=PlacementGroupModel, lock_timeout=timedelta(seconds=30), @@ -63,7 +63,7 @@ def _placement_group_to_pipeline_item(placement_group: PlacementGroupModel) -> P class TestHeartbeater: @pytest.mark.asyncio async def test_untrack_preserves_item_when_lock_token_mismatches( - self, heartbeater: Heartbeater[PlacementGroupModel], now: datetime + self, heartbeater: Heartbeater[PipelineItem], now: datetime ): item = PipelineItem( __tablename__=PlacementGroupModel.__tablename__, @@ -93,7 +93,7 @@ async def test_heartbeat_extends_locks_close_to_expiration( self, test_db, session: AsyncSession, - heartbeater: Heartbeater[PlacementGroupModel], + heartbeater: Heartbeater[PipelineItem], now: datetime, ): placement_group = await _create_locked_placement_group( @@ -122,7 +122,7 @@ async def test_heartbeat_untracks_expired_items_without_db_update( self, test_db, session: AsyncSession, - heartbeater: Heartbeater[PlacementGroupModel], + heartbeater: Heartbeater[PipelineItem], now: datetime, ): original_lock_expires_at = now - timedelta(seconds=1) @@ -150,7 +150,7 @@ async def test_heartbeat_untracks_item_when_lock_token_changed_in_db( self, test_db, session: AsyncSession, - heartbeater: Heartbeater[PlacementGroupModel], + heartbeater: Heartbeater[PipelineItem], now: datetime, ): original_lock_expires_at = now + timedelta(seconds=2) diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py b/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py new file mode 100644 index 0000000000..9628451bdc --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py @@ -0,0 +1,292 @@ +import uuid +from datetime import datetime, timezone +from unittest.mock import MagicMock, Mock, patch + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload + +from dstack._internal.core.errors import BackendError +from 
dstack._internal.core.models.gateways import GatewayProvisioningData, GatewayStatus +from dstack._internal.server.background.pipeline_tasks.gateways import ( + GatewayPipelineItem, + GatewayWorker, +) +from dstack._internal.server.models import GatewayModel +from dstack._internal.server.testing.common import ( + AsyncContextManager, + ComputeMockSpec, + create_backend, + create_gateway, + create_gateway_compute, + create_project, + list_events, +) + + +@pytest.fixture +def worker() -> GatewayWorker: + return GatewayWorker(queue=Mock(), heartbeater=Mock()) + + +def _gateway_to_pipeline_item(gateway_model: GatewayModel) -> GatewayPipelineItem: + assert gateway_model.lock_token is not None + assert gateway_model.lock_expires_at is not None + return GatewayPipelineItem( + __tablename__=gateway_model.__tablename__, + id=gateway_model.id, + lock_token=gateway_model.lock_token, + lock_expires_at=gateway_model.lock_expires_at, + prev_lock_expired=False, + status=gateway_model.status, + to_be_deleted=gateway_model.to_be_deleted, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGatewayWorkerSubmitted: + async def test_submitted_to_provisioning( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.SUBMITTED, + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" + ) as m: + aws = Mock() + m.return_value = (backend, aws) + aws.compute.return_value = Mock(spec=ComputeMockSpec) + aws.compute.return_value.create_gateway.return_value = GatewayProvisioningData( + 
instance_id="i-1234567890", + ip_address="2.2.2.2", + region="us", + ) + await worker.process(_gateway_to_pipeline_item(gateway)) + m.assert_called_once() + aws.compute.return_value.create_gateway.assert_called_once() + + await session.refresh(gateway) + res = await session.execute( + select(GatewayModel) + .where(GatewayModel.id == gateway.id) + .options(joinedload(GatewayModel.gateway_compute)) + ) + gateway = res.unique().scalar_one() + assert gateway.status == GatewayStatus.PROVISIONING + assert gateway.gateway_compute is not None + assert gateway.gateway_compute.ip_address == "2.2.2.2" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed SUBMITTED -> PROVISIONING" + + async def test_marks_gateway_as_failed_if_gateway_creation_errors( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + status=GatewayStatus.SUBMITTED, + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_with_model_by_type_or_error" + ) as m: + aws = Mock() + m.return_value = (backend, aws) + aws.compute.return_value = Mock(spec=ComputeMockSpec) + aws.compute.return_value.create_gateway.side_effect = BackendError("Some error") + await worker.process(_gateway_to_pipeline_item(gateway)) + m.assert_called_once() + aws.compute.return_value.create_gateway.assert_called_once() + + await session.refresh(gateway) + assert gateway.status == GatewayStatus.FAILED + assert gateway.status_message == "Some error" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed SUBMITTED -> 
FAILED (Some error)" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGatewayWorkerProvisioning: + async def test_provisioning_to_running( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway_compute = await create_gateway_compute(session) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + gateway_compute_id=gateway_compute.id, + status=GatewayStatus.PROVISIONING, + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.services.gateways.gateway_connections_pool.get_or_add" + ) as pool_add: + pool_add.return_value = MagicMock() + pool_add.return_value.client.return_value = MagicMock(AsyncContextManager()) + await worker.process(_gateway_to_pipeline_item(gateway)) + pool_add.assert_called_once() + + await session.refresh(gateway) + assert gateway.status == GatewayStatus.RUNNING + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway status changed PROVISIONING -> RUNNING" + + async def test_marks_gateway_as_failed_if_fails_to_connect( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway_compute = await create_gateway_compute(session) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + gateway_compute_id=gateway_compute.id, + status=GatewayStatus.PROVISIONING, + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + 
"dstack._internal.server.services.gateways.connect_to_gateway_with_retry" + ) as connect_to_gateway_with_retry_mock: + connect_to_gateway_with_retry_mock.return_value = None + await worker.process(_gateway_to_pipeline_item(gateway)) + connect_to_gateway_with_retry_mock.assert_called_once() + + await session.refresh(gateway) + assert gateway.status == GatewayStatus.FAILED + assert gateway.status_message == "Failed to connect to gateway" + events = await list_events(session) + assert len(events) == 1 + assert ( + events[0].message + == "Gateway status changed PROVISIONING -> FAILED (Failed to connect to gateway)" + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGatewayWorkerDeleted: + async def test_deletes_gateway_and_marks_compute_deleted( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway_compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + gateway_compute_id=gateway_compute.id, + status=GatewayStatus.RUNNING, + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + gateway.to_be_deleted = True + await session.commit() + + with ( + patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as get_backend_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.gateways.gateway_connections_pool.remove" + ) as remove_connection_mock, + ): + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + get_backend_mock.return_value = backend_mock + + await worker.process(_gateway_to_pipeline_item(gateway)) + + get_backend_mock.assert_called_once() + 
backend_mock.compute.return_value.terminate_gateway.assert_called_once() + remove_connection_mock.assert_called_once_with(gateway_compute.ip_address) + + await session.refresh(gateway_compute) + res = await session.execute(select(GatewayModel.id).where(GatewayModel.id == gateway.id)) + assert res.scalar_one_or_none() is None + assert gateway_compute.active is False + assert gateway_compute.deleted is True + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Gateway deleted" + + async def test_keeps_gateway_if_terminate_fails( + self, test_db, session: AsyncSession, worker: GatewayWorker + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + gateway_compute = await create_gateway_compute(session=session, backend_id=backend.id) + gateway = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + gateway_compute_id=gateway_compute.id, + status=GatewayStatus.RUNNING, + ) + gateway.lock_token = uuid.uuid4() + gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + gateway.to_be_deleted = True + original_last_processed_at = gateway.last_processed_at + await session.commit() + + with ( + patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as get_backend_mock, + patch( + "dstack._internal.server.background.pipeline_tasks.gateways.gateway_connections_pool.remove" + ) as remove_connection_mock, + ): + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.terminate_gateway.side_effect = BackendError( + "Terminate failed" + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_gateway_to_pipeline_item(gateway)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.terminate_gateway.assert_called_once() + remove_connection_mock.assert_not_called() + + 
await session.refresh(gateway) + await session.refresh(gateway_compute) + assert gateway.to_be_deleted is True + assert gateway.last_processed_at > original_last_processed_at + assert gateway_compute.active is True + assert gateway_compute.deleted is False + events = await list_events(session) + assert len(events) == 0 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py index 87cab83e12..7baed58b64 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py @@ -7,6 +7,7 @@ from dstack._internal.server.background.pipeline_tasks.base import PipelineItem from dstack._internal.server.background.pipeline_tasks.placement_groups import PlacementGroupWorker +from dstack._internal.server.models import PlacementGroupModel from dstack._internal.server.testing.common import ( ComputeMockSpec, create_fleet, @@ -20,7 +21,7 @@ def worker() -> PlacementGroupWorker: return PlacementGroupWorker(queue=Mock(), heartbeater=Mock()) -def _placement_group_to_pipeline_item(placement_group) -> PipelineItem: +def _placement_group_to_pipeline_item(placement_group: PlacementGroupModel) -> PipelineItem: assert placement_group.lock_token is not None assert placement_group.lock_expires_at is not None return PipelineItem( diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_gateways.py b/src/tests/_internal/server/background/scheduled_tasks/test_gateways.py index 5f19d2cfcd..b97abe9143 100644 --- a/src/tests/_internal/server/background/scheduled_tasks/test_gateways.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_gateways.py @@ -1,11 +1,14 @@ from unittest.mock import MagicMock, Mock, patch import pytest +from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload from 
dstack._internal.core.errors import BackendError from dstack._internal.core.models.gateways import GatewayProvisioningData, GatewayStatus from dstack._internal.server.background.scheduled_tasks.gateways import process_gateways +from dstack._internal.server.models import GatewayModel from dstack._internal.server.testing.common import ( AsyncContextManager, ComputeMockSpec, @@ -44,6 +47,12 @@ async def test_submitted_to_provisioning(self, test_db, session: AsyncSession): m.assert_called_once() aws.compute.return_value.create_gateway.assert_called_once() await session.refresh(gateway) + res = await session.execute( + select(GatewayModel) + .where(GatewayModel.id == gateway.id) + .options(joinedload(GatewayModel.gateway_compute)) + ) + gateway = res.unique().scalar_one() assert gateway.status == GatewayStatus.PROVISIONING assert gateway.gateway_compute is not None assert gateway.gateway_compute.ip_address == "2.2.2.2" diff --git a/src/tests/_internal/server/routers/test_gateways.py b/src/tests/_internal/server/routers/test_gateways.py index f80537a1b1..a0f7566bff 100644 --- a/src/tests/_internal/server/routers/test_gateways.py +++ b/src/tests/_internal/server/routers/test_gateways.py @@ -20,6 +20,15 @@ list_events, ) from dstack._internal.server.testing.matchers import SomeUUID4Str +from dstack._internal.settings import FeatureFlags + + +@pytest.fixture +def patch_pipeline_processing_flag(monkeypatch: pytest.MonkeyPatch): + def _apply(enabled: bool): + monkeypatch.setattr(FeatureFlags, "PIPELINE_PROCESSING_ENABLED", enabled) + + return _apply class TestListAndGetGateways: @@ -455,9 +464,88 @@ async def test_only_admin_can_delete( ) assert response.status_code == 403 + +class TestDeleteGatewayPipelineEnabled: + @pytest.fixture(autouse=True) + def _pipeline_processing_enabled(self, patch_pipeline_processing_flag): + patch_pipeline_processing_flag(True) + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) - async def 
test_delete_gateway(self, test_db, session: AsyncSession, client: AsyncClient): + async def test_marks_gateways_to_be_deleted( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.ADMIN + ) + backend_aws = await create_backend(session, project.id) + backend_gcp = await create_backend(session, project.id, backend_type=BackendType.GCP) + gateway_compute_aws = await create_gateway_compute( + session=session, + backend_id=backend_aws.id, + ) + gateway_aws = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend_aws.id, + name="gateway-aws", + gateway_compute_id=gateway_compute_aws.id, + ) + gateway_compute_gcp = await create_gateway_compute( + session=session, + backend_id=backend_gcp.id, + ) + gateway_gcp = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend_gcp.id, + name="gateway-gcp", + gateway_compute_id=gateway_compute_gcp.id, + ) + response = await client.post( + f"/api/project/{project.name}/gateways/delete", + json={"names": [gateway_aws.name, gateway_gcp.name]}, + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + + await session.refresh(gateway_aws) + await session.refresh(gateway_gcp) + await session.refresh(gateway_compute_aws) + await session.refresh(gateway_compute_gcp) + assert gateway_aws.to_be_deleted is True + assert gateway_gcp.to_be_deleted is True + assert gateway_compute_aws.active is True + assert gateway_compute_aws.deleted is False + assert gateway_compute_gcp.active is True + assert gateway_compute_gcp.deleted is False + + response = await client.post( + f"/api/project/{project.name}/gateways/list", + headers=get_auth_headers(user.token), + ) + assert response.status_code == 200 + assert {g["name"] for g in response.json()} == 
{"gateway-aws", "gateway-gcp"} + + events = await list_events(session) + assert len(events) == 2 + assert all(e.message == "Gateway marked for deletion" for e in events) + assert {e.targets[0].entity_name for e in events} == {"gateway-aws", "gateway-gcp"} + assert all(e.actor_user_id == user.id for e in events) + + +class TestDeleteGatewayPipelineDisabled: + @pytest.fixture(autouse=True) + def _pipeline_processing_disabled(self, patch_pipeline_processing_flag): + patch_pipeline_processing_flag(False) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_deletes_gateways_synchronously( + self, test_db, session: AsyncSession, client: AsyncClient + ): user = await create_user(session, global_role=GlobalRole.USER) project = await create_project(session) await add_project_member( @@ -545,6 +633,7 @@ def get_backend(project, backend_type): }, } ] + events = await list_events(session) assert len(events) == 1 assert events[0].message == "Gateway deleted" From 798f6bcbe32966d5ffbbedbfa196bdf79a967d9b Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Mon, 23 Feb 2026 11:43:45 +0000 Subject: [PATCH 162/187] Allow detecting whether service `https` is unset (#3601) Make the `https` service configuration property `Optional`. This allows to determine whether the property was omitted or explicitly set by the user. In a future version, we could use that to improve validation or change the default on the server. For now, the behavior is unchanged - an unset `https` is equivalent to `https: true`. Backward compatibility is preserved for the most part, except two side effects: - Users may see a phantom `https` change in the run plan when redeploying a service after upgrading to CLI 0.20.12+. This, however, will not block the rolling deployment and will not cause any actual changes to the service behavior. 
- Users with a pre-0.20.12 CLI will not be able to trigger a rolling deployment on a service deployed with a 0.20.12+ CLI and will see the `Failed to apply plan. Resource has been changed` error message. --- src/dstack/_internal/core/compatibility/runs.py | 2 ++ src/dstack/_internal/core/models/configurations.py | 7 ++++--- src/dstack/_internal/server/compatibility/runs.py | 9 ++++++++- src/dstack/_internal/server/services/runs/spec.py | 14 +++++++++++++- .../_internal/server/services/services/__init__.py | 13 +++++++++---- .../server/services/services/test_services.py | 6 +++--- 6 files changed, 39 insertions(+), 12 deletions(-) diff --git a/src/dstack/_internal/core/compatibility/runs.py b/src/dstack/_internal/core/compatibility/runs.py index 4ece12392c..d8330031b5 100644 --- a/src/dstack/_internal/core/compatibility/runs.py +++ b/src/dstack/_internal/core/compatibility/runs.py @@ -78,6 +78,8 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType: configuration_excludes["router"] = True elif isinstance(router, SGLangServiceRouterConfig) and router.pd_disaggregation is False: configuration_excludes["router"] = {"pd_disaggregation": True} + if run_spec.configuration.https is None: + configuration_excludes["https"] = True if configuration_excludes: spec_excludes["configuration"] = configuration_excludes diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 87e38b496a..8c86fd5bd9 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -857,12 +857,13 @@ class ServiceConfigurationParams(CoreModel): ), ] = None https: Annotated[ - Union[bool, Literal["auto"]], + Optional[Union[bool, Literal["auto"]]], Field( description="Enable HTTPS if running with a gateway." - " Set to `auto` to determine automatically based on gateway configuration" + " Set to `auto` to determine automatically based on gateway configuration." 
+ f" Defaults to `{str(SERVICE_HTTPS_DEFAULT).lower()}`" ), - ] = SERVICE_HTTPS_DEFAULT + ] = None auth: Annotated[bool, Field(description="Enable the authorization")] = True scaling: Annotated[ diff --git a/src/dstack/_internal/server/compatibility/runs.py b/src/dstack/_internal/server/compatibility/runs.py index 2e715ce162..77df3f5fae 100644 --- a/src/dstack/_internal/server/compatibility/runs.py +++ b/src/dstack/_internal/server/compatibility/runs.py @@ -2,7 +2,7 @@ from packaging.version import Version -from dstack._internal.core.models.configurations import ServiceConfiguration +from dstack._internal.core.models.configurations import SERVICE_HTTPS_DEFAULT, ServiceConfiguration from dstack._internal.core.models.runs import Run, RunPlan, RunSpec from dstack._internal.server.compatibility.common import patch_offers_list @@ -34,3 +34,10 @@ def patch_run_spec(run_spec: RunSpec, client_version: Optional[Version]) -> None ): if run_spec.configuration.probes is None: run_spec.configuration.probes = [] + # Clients prior to 0.20.12 do not support https = None + if ( + client_version < Version("0.20.12") + and isinstance(run_spec.configuration, ServiceConfiguration) + and run_spec.configuration.https is None + ): + run_spec.configuration.https = SERVICE_HTTPS_DEFAULT diff --git a/src/dstack/_internal/server/services/runs/spec.py b/src/dstack/_internal/server/services/runs/spec.py index a18f151ce1..8623fd0a1e 100644 --- a/src/dstack/_internal/server/services/runs/spec.py +++ b/src/dstack/_internal/server/services/runs/spec.py @@ -1,5 +1,9 @@ from dstack._internal.core.errors import ServerClientError -from dstack._internal.core.models.configurations import RUN_PRIORITY_DEFAULT, ServiceConfiguration +from dstack._internal.core.models.configurations import ( + RUN_PRIORITY_DEFAULT, + SERVICE_HTTPS_DEFAULT, + ServiceConfiguration, +) from dstack._internal.core.models.repos.virtual import DEFAULT_VIRTUAL_REPO_ID, VirtualRunRepoData from dstack._internal.core.models.runs import 
LEGACY_REPO_DIR, AnyRunConfiguration, RunSpec from dstack._internal.core.models.volumes import InstanceMountPoint @@ -203,6 +207,14 @@ def _check_can_update_configuration( # Currently, the client preserves the original file/dir name it the tarball, but it could # use some generic names like "file"/"directory" instead. updatable_fields.append("files") + if ( + isinstance(current, ServiceConfiguration) + and isinstance(new, ServiceConfiguration) + and current.https in (None, SERVICE_HTTPS_DEFAULT) + and new.https in (None, SERVICE_HTTPS_DEFAULT) + ): + # Allow switching between `https: ` and unset `https`. Has no effect. + updatable_fields.append("https") diff = diff_models(current, new) changed_fields = list(diff.keys()) for key in changed_fields: diff --git a/src/dstack/_internal/server/services/services/__init__.py b/src/dstack/_internal/server/services/services/__init__.py index 3846f5d6a8..2a730b5695 100644 --- a/src/dstack/_internal/server/services/services/__init__.py +++ b/src/dstack/_internal/server/services/services/__init__.py @@ -22,6 +22,7 @@ ) from dstack._internal.core.models.configurations import ( DEFAULT_REPLICA_GROUP_NAME, + SERVICE_HTTPS_DEFAULT, ServiceConfiguration, ) from dstack._internal.core.models.gateways import GatewayConfiguration, GatewayStatus @@ -251,11 +252,13 @@ def _register_service_in_server(run_model: RunModel, run_spec: RunSpec) -> Servi "Service with SGLang router configuration requires a gateway. " "Please configure a gateway with the SGLang router enabled." ) - if run_spec.configuration.https is False: - # Note: if the user sets `https: `, it will be ignored silently - # TODO: in 0.19, make `https` Optional to be able to tell if it was set or omitted + if run_spec.configuration.https not in ( + None, + "auto", + True, # Default set by pre-0.20.12 clients. TODO(0.21.0?): forbid True too. + ): raise ServerClientError( - "The `https` configuration property is not applicable when running services without a gateway." 
+ f"Setting `https: {run_spec.configuration.https}` is not allowed without a gateway." " Please configure a gateway or remove the `https` property from the service configuration" ) # Check if any group has autoscaling (min != max) @@ -427,6 +430,8 @@ async def unregister_replica(session: AsyncSession, job_model: JobModel): def _get_service_https(run_spec: RunSpec, configuration: GatewayConfiguration) -> bool: assert run_spec.configuration.type == "service" https = run_spec.configuration.https + if https is None: + https = SERVICE_HTTPS_DEFAULT if https == "auto": if configuration.certificate is None: return False diff --git a/src/tests/_internal/server/services/services/test_services.py b/src/tests/_internal/server/services/services/test_services.py index 1ee3bf70e4..f028055b8d 100644 --- a/src/tests/_internal/server/services/services/test_services.py +++ b/src/tests/_internal/server/services/services/test_services.py @@ -45,9 +45,9 @@ def _mock_run_model() -> MagicMock: class TestServiceConfigurationHttps: - def test_default_is_true(self) -> None: + def test_accepts_unset(self) -> None: conf = ServiceConfiguration(commands=["python serve.py"], port=8000) - assert conf.https is True + assert conf.https is None def test_accepts_auto(self) -> None: conf = ServiceConfiguration(commands=["python serve.py"], port=8000, https="auto") @@ -96,5 +96,5 @@ def test_allows_auto_without_gateway(self) -> None: def test_rejects_explicit_false_without_gateway(self) -> None: run_spec = _service_run_spec(https=False) - with pytest.raises(ServerClientError, match="not applicable"): + with pytest.raises(ServerClientError, match="not allowed without a gateway"): _register_service_in_server(_mock_run_model(), run_spec) From d1fd2387a131787c4f12ccdae5c96e320249279d Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 24 Feb 2026 16:34:31 +0500 Subject: [PATCH 163/187] Implement volume pipeline (#3604) * Fix process_submitted_volumes selecting deleted volumes * Implement pipeline 
for submitted volumes * Implement delete volume async API * Add TestVolumeWorkerDeleted * Make process_idle_volumes work with pipelines * Handle locked volumes when attaching * Test idle volumes with pipelines * Add index ix_volumes_pipeline_fetch_q * Add sentry instrumentation for pipeline tasks * Instrument all pipelines * Fix to_be_deleted server_default --- src/dstack/_internal/core/models/volumes.py | 2 + .../background/pipeline_tasks/__init__.py | 2 + .../pipeline_tasks/compute_groups.py | 3 + .../background/pipeline_tasks/gateways.py | 3 + .../pipeline_tasks/placement_groups.py | 5 + .../background/pipeline_tasks/volumes.py | 448 ++++++++++++++++++ .../background/scheduled_tasks/__init__.py | 6 +- .../scheduled_tasks/compute_groups.py | 2 +- .../background/scheduled_tasks/events.py | 2 +- .../background/scheduled_tasks/fleets.py | 2 +- .../background/scheduled_tasks/gateways.py | 2 +- .../scheduled_tasks/idle_volumes.py | 43 +- .../background/scheduled_tasks/instances.py | 4 +- .../background/scheduled_tasks/metrics.py | 4 +- .../scheduled_tasks/placement_groups.py | 2 +- .../scheduled_tasks/prometheus_metrics.py | 4 +- .../scheduled_tasks/running_jobs.py | 2 +- .../server/background/scheduled_tasks/runs.py | 2 +- .../scheduled_tasks/submitted_jobs.py | 21 +- .../scheduled_tasks/terminating_jobs.py | 2 +- .../background/scheduled_tasks/volumes.py | 3 +- ...ac7924_add_volumemodel_pipeline_columns.py | 53 +++ ...4_add_ix_volumes_pipeline_fetch_q_index.py | 50 ++ src/dstack/_internal/server/models.py | 14 +- .../_internal/server/routers/volumes.py | 3 + .../server/services/jobs/__init__.py | 6 +- .../_internal/server/services/volumes.py | 128 ++++- .../_internal/server/utils/sentry_utils.py | 20 +- .../background/pipeline_tasks/test_volumes.py | 336 +++++++++++++ .../scheduled_tasks/test_idle_volumes.py | 167 ++++++- .../scheduled_tasks/test_submitted_jobs.py | 73 +++ .../_internal/server/services/test_volumes.py | 5 +- 32 files changed, 1369 insertions(+), 50 
deletions(-) create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/volumes.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/02_23_1134_ccfac6ac7924_add_volumemodel_pipeline_columns.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/02_24_0945_9a363c3cbe04_add_ix_volumes_pipeline_fetch_q_index.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_volumes.py diff --git a/src/dstack/_internal/core/models/volumes.py b/src/dstack/_internal/core/models/volumes.py index 0f89b770f0..280ab14f10 100644 --- a/src/dstack/_internal/core/models/volumes.py +++ b/src/dstack/_internal/core/models/volumes.py @@ -17,6 +17,8 @@ class VolumeStatus(str, Enum): SUBMITTED = "submitted" + # PROVISIONING is currently not used since on all backends supporting volumes, + # volumes become ACTIVE (ready to be used) almost immediately after provisioning. PROVISIONING = "provisioning" ACTIVE = "active" FAILED = "failed" diff --git a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py index 01feb958d0..d9f67680ce 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py @@ -6,6 +6,7 @@ from dstack._internal.server.background.pipeline_tasks.placement_groups import ( PlacementGroupPipeline, ) +from dstack._internal.server.background.pipeline_tasks.volumes import VolumePipeline from dstack._internal.utils.logging import get_logger logger = get_logger(__name__) @@ -17,6 +18,7 @@ def __init__(self) -> None: ComputeGroupPipeline(), GatewayPipeline(), PlacementGroupPipeline(), + VolumePipeline(), ] self._hinter = PipelineHinter(self._pipelines) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py index 938c6013c8..33e839b8b6 100644 --- 
a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py @@ -27,6 +27,7 @@ from dstack._internal.server.services.compute_groups import compute_group_model_to_compute_group from dstack._internal.server.services.instances import emit_instance_status_change_event from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.utils import sentry_utils from dstack._internal.utils.common import get_current_datetime, run_async from dstack._internal.utils.logging import get_logger @@ -107,6 +108,7 @@ def __init__( queue_check_delay=queue_check_delay, ) + @sentry_utils.instrument_named_task("pipeline_tasks.ComputeGroupFetcher.fetch") async def fetch(self, limit: int) -> list[PipelineItem]: compute_group_lock, _ = get_locker(get_db().dialect_name).get_lockset( ComputeGroupModel.__tablename__ @@ -172,6 +174,7 @@ def __init__( heartbeater=heartbeater, ) + @sentry_utils.instrument_named_task("pipeline_tasks.ComputeGroupWorker.process") async def process(self, item: PipelineItem): async with get_session_ctx() as session: res = await session.execute( diff --git a/src/dstack/_internal/server/background/pipeline_tasks/gateways.py b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py index c64cd719a2..cdd0904e1a 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/gateways.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py @@ -35,6 +35,7 @@ from dstack._internal.server.services.gateways.pool import gateway_connections_pool from dstack._internal.server.services.locking import get_locker from dstack._internal.server.services.logging import fmt +from dstack._internal.server.utils import sentry_utils from dstack._internal.utils.common import get_current_datetime, run_async from dstack._internal.utils.logging import get_logger @@ -118,6 +119,7 @@ def __init__( queue_check_delay=queue_check_delay, ) + 
@sentry_utils.instrument_named_task("pipeline_tasks.GatewayFetcher.fetch") async def fetch(self, limit: int) -> list[GatewayPipelineItem]: gateway_lock, _ = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__) async with gateway_lock: @@ -193,6 +195,7 @@ def __init__( heartbeater=heartbeater, ) + @sentry_utils.instrument_named_task("pipeline_tasks.GatewayWorker.process") async def process(self, item: GatewayPipelineItem): if item.to_be_deleted: await _process_to_be_deleted_item(item) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py index a184379c37..193358ec0f 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py @@ -26,6 +26,7 @@ from dstack._internal.server.services import backends as backends_services from dstack._internal.server.services.locking import get_locker from dstack._internal.server.services.placement import placement_group_model_to_placement_group +from dstack._internal.server.utils import sentry_utils from dstack._internal.utils.common import get_current_datetime, run_async from dstack._internal.utils.logging import get_logger @@ -103,6 +104,7 @@ def __init__( queue_check_delay=queue_check_delay, ) + @sentry_utils.instrument_named_task("pipeline_tasks.PlacementGroupFetcher.fetch") async def fetch(self, limit: int) -> list[PipelineItem]: placement_group_lock, _ = get_locker(get_db().dialect_name).get_lockset( PlacementGroupModel.__tablename__ @@ -170,6 +172,7 @@ def __init__( heartbeater=heartbeater, ) + @sentry_utils.instrument_named_task("pipeline_tasks.PlacementGroupWorker.process") async def process(self, item: PipelineItem): async with get_session_ctx() as session: res = await session.execute( @@ -230,6 +233,7 @@ async def _delete_placement_group(placement_group_model: PlacementGroupModel) -> 
backend_type=placement_group.provisioning_data.backend, ) if backend is None: + # TODO: Retry deletion logger.error( "Failed to delete placement group %s. Backend not available. Please delete it manually.", placement_group.name, @@ -245,6 +249,7 @@ async def _delete_placement_group(placement_group_model: PlacementGroupModel) -> ) return {} except Exception: + # TODO: Retry deletion logger.exception( "Got exception when deleting placement group %s. Please delete it manually.", placement_group.name, diff --git a/src/dstack/_internal/server/background/pipeline_tasks/volumes.py b/src/dstack/_internal/server/background/pipeline_tasks/volumes.py new file mode 100644 index 0000000000..578fe8423b --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/volumes.py @@ -0,0 +1,448 @@ +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Sequence + +from sqlalchemy import or_, select, update +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.backends.base.compute import ComputeWithVolumeSupport +from dstack._internal.core.errors import BackendError, BackendNotAvailable +from dstack._internal.core.models.volumes import VolumeStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + Pipeline, + PipelineItem, + UpdateMap, + Worker, + get_processed_update_map, + get_unlock_update_map, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + FleetModel, + InstanceModel, + ProjectModel, + UserModel, + VolumeAttachmentModel, + VolumeModel, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services import events +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.volumes import ( + emit_volume_status_change_event, + volume_model_to_volume, +) +from 
dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +@dataclass +class VolumePipelineItem(PipelineItem): + status: VolumeStatus + to_be_deleted: bool + + +class VolumePipeline(Pipeline[VolumePipelineItem]): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=15), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[VolumePipelineItem]( + model_type=VolumeModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = VolumeFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + VolumeWorker(queue=self._queue, heartbeater=self._heartbeater) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return VolumeModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[VolumePipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[VolumePipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["VolumeWorker"]: + return self.__workers + + +class VolumeFetcher(Fetcher[VolumePipelineItem]): + def __init__( + self, + queue: asyncio.Queue[VolumePipelineItem], + 
queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[VolumePipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_named_task("pipeline_tasks.VolumeFetcher.fetch") + async def fetch(self, limit: int) -> list[VolumePipelineItem]: + volume_lock, _ = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__) + async with volume_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(VolumeModel) + .where( + or_( + VolumeModel.status == VolumeStatus.SUBMITTED, + VolumeModel.to_be_deleted == True, + ), + VolumeModel.deleted == False, + or_( + VolumeModel.last_processed_at <= now - self._min_processing_interval, + VolumeModel.last_processed_at == VolumeModel.created_at, + ), + or_( + VolumeModel.lock_expires_at.is_(None), + VolumeModel.lock_expires_at < now, + ), + or_( + VolumeModel.lock_owner.is_(None), + VolumeModel.lock_owner == VolumePipeline.__name__, + ), + ) + .order_by(VolumeModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True) + .options( + load_only( + VolumeModel.id, + VolumeModel.lock_token, + VolumeModel.lock_expires_at, + VolumeModel.status, + VolumeModel.to_be_deleted, + ) + ) + ) + volume_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for volume_model in volume_models: + prev_lock_expired = volume_model.lock_expires_at is not None + volume_model.lock_expires_at = lock_expires_at + volume_model.lock_token = lock_token + volume_model.lock_owner = VolumePipeline.__name__ + items.append( + VolumePipelineItem( + 
__tablename__=VolumeModel.__tablename__, + id=volume_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + status=volume_model.status, + to_be_deleted=volume_model.to_be_deleted, + ) + ) + await session.commit() + return items + + +class VolumeWorker(Worker[VolumePipelineItem]): + def __init__( + self, + queue: asyncio.Queue[VolumePipelineItem], + heartbeater: Heartbeater[VolumePipelineItem], + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + ) + + @sentry_utils.instrument_named_task("pipeline_tasks.VolumeWorker.process") + async def process(self, item: VolumePipelineItem): + if item.to_be_deleted: + await _process_to_be_deleted_item(item) + elif item.status == VolumeStatus.SUBMITTED: + await _process_submitted_item(item) + elif item.status == VolumeStatus.ACTIVE: + pass + + +async def _process_submitted_item(item: VolumePipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(VolumeModel) + .where( + VolumeModel.id == item.id, + VolumeModel.lock_token == item.lock_token, + ) + .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(VolumeModel.user)) + .options( + joinedload(VolumeModel.attachments) + .joinedload(VolumeAttachmentModel.instance) + .joinedload(InstanceModel.fleet) + .load_only(FleetModel.name) + ) + ) + volume_model = res.unique().scalar_one_or_none() + if volume_model is None: + logger.warning( + "Failed to process %s item %s: lock_token mismatch." 
+ " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + result = await _process_submitted_volume(volume_model) + update_map = result.update_map | get_processed_update_map() | get_unlock_update_map() + async with get_session_ctx() as session: + res = await session.execute( + update(VolumeModel) + .where( + VolumeModel.id == volume_model.id, + VolumeModel.lock_token == volume_model.lock_token, + ) + .values(**update_map) + .returning(VolumeModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.warning( + "Failed to update %s item %s after processing: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + # TODO: Clean up volume. + return + emit_volume_status_change_event( + session=session, + volume_model=volume_model, + old_status=volume_model.status, + new_status=update_map.get("status", volume_model.status), + status_message=update_map.get("status_message", volume_model.status_message), + ) + + +@dataclass +class _SubmittedResult: + update_map: UpdateMap = field(default_factory=dict) + + +async def _process_submitted_volume(volume_model: VolumeModel) -> _SubmittedResult: + volume = volume_model_to_volume(volume_model) + try: + backend = await backends_services.get_project_backend_by_type_or_error( + project=volume_model.project, + backend_type=volume.configuration.backend, + overrides=True, + ) + except BackendNotAvailable: + logger.error( + "Failed to process volume %s. 
Backend %s not available.", + volume.name, + volume.configuration.backend.value, + ) + return _SubmittedResult( + update_map={ + "status": VolumeStatus.FAILED, + "status_message": "Backend not available", + } + ) + + compute = backend.compute() + assert isinstance(compute, ComputeWithVolumeSupport) + try: + if volume.configuration.volume_id is not None: + logger.info("Registering external volume %s", volume_model.name) + vpd = await run_async( + compute.register_volume, + volume=volume, + ) + else: + logger.info("Provisioning new volume %s", volume_model.name) + vpd = await run_async( + compute.create_volume, + volume=volume, + ) + except BackendError as e: + logger.info("Failed to create volume %s: %s", volume_model.name, repr(e)) + status_message = f"Backend error: {repr(e)}" + if len(e.args) > 0: + status_message = str(e.args[0]) + return _SubmittedResult( + update_map={ + "status": VolumeStatus.FAILED, + "status_message": status_message, + } + ) + except Exception as e: + logger.exception("Got exception when creating volume %s", volume_model.name) + return _SubmittedResult( + update_map={ + "status": VolumeStatus.FAILED, + "status_message": f"Unexpected error: {repr(e)}", + } + ) + + logger.info("Added new volume %s", volume_model.name) + # Provisioned volumes marked as active since they become available almost immediately in AWS + # TODO: Consider checking volume state + return _SubmittedResult( + update_map={ + "status": VolumeStatus.ACTIVE, + "volume_provisioning_data": vpd.json(), + } + ) + + +async def _process_to_be_deleted_item(item: VolumePipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(VolumeModel) + .where( + VolumeModel.id == item.id, + VolumeModel.lock_token == item.lock_token, + ) + .options(joinedload(VolumeModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(VolumeModel.user).load_only(UserModel.name)) + .options( + joinedload(VolumeModel.attachments) + 
.joinedload(VolumeAttachmentModel.instance) + .joinedload(InstanceModel.fleet) + .load_only(FleetModel.name) + ) + ) + volume_model = res.unique().scalar_one_or_none() + if volume_model is None: + logger.warning( + "Failed to process %s item %s: lock_token mismatch." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + result = await _process_to_be_deleted_volume(volume_model) + update_map = result.update_map | get_unlock_update_map() + async with get_session_ctx() as session: + res = await session.execute( + update(VolumeModel) + .where( + VolumeModel.id == volume_model.id, + VolumeModel.lock_token == volume_model.lock_token, + ) + .values(**update_map) + .returning(VolumeModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.warning( + "Failed to update %s item %s after processing: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + events.emit( + session, + "Volume deleted", + actor=events.SystemActor(), + targets=[events.Target.from_model(volume_model)], + ) + + +@dataclass +class _DeletedResult: + update_map: UpdateMap = field(default_factory=dict) + + +async def _process_to_be_deleted_volume(volume_model: VolumeModel) -> _DeletedResult: + volume = volume_model_to_volume(volume_model) + if volume.external: + return _get_deleted_result() + if volume.provisioning_data is None: + # The volume wasn't provisioned so there is nothing to delete + return _get_deleted_result() + if volume.provisioning_data.backend is None: + logger.error( + f"Failed to delete volume {volume_model.name}. volume.provisioning_data.backend is None." 
+ ) + return _get_deleted_result() + try: + backend = await backends_services.get_project_backend_by_type_or_error( + project=volume_model.project, + backend_type=volume.provisioning_data.backend, + ) + except BackendNotAvailable: + # TODO: Retry deletion + logger.error( + f"Failed to delete volume {volume_model.name}. Backend {volume.configuration.backend} not available." + " Please terminate it manually to avoid unexpected charges.", + ) + return _get_deleted_result() + + compute = backend.compute() + assert isinstance(compute, ComputeWithVolumeSupport) + try: + await run_async( + compute.delete_volume, + volume=volume, + ) + except Exception: + # TODO: Retry deletion + logger.exception( + "Got exception when deleting volume %s. Please terminate it manually to avoid unexpected charges.", + volume.name, + ) + return _get_deleted_result() + + +def _get_deleted_result() -> _DeletedResult: + now = get_current_datetime() + return _DeletedResult( + update_map={ + "last_processed_at": now, + "deleted": True, + "deleted_at": now, + } + ) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py index 6067d9d4de..45ae8ec7fd 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py @@ -99,9 +99,6 @@ def start_scheduled_tasks() -> AsyncIOScheduler: ) _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1) _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15)) - _scheduler.add_job( - process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5 - ) _scheduler.add_job( process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1 ) @@ -116,6 +113,9 @@ def start_scheduled_tasks() -> AsyncIOScheduler: process_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5 ) 
_scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5)) + _scheduler.add_job( + process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5 + ) for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR): # Add multiple copies of tasks if requested. # max_instances=1 for additional copies to avoid running too many tasks. diff --git a/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py b/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py index 6b449efab4..feb1cc5070 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py @@ -39,7 +39,7 @@ async def process_compute_groups(batch_size: int = 1): await asyncio.gather(*tasks) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def _process_next_compute_group(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(ComputeGroupModel.__tablename__) async with get_session_ctx() as session: diff --git a/src/dstack/_internal/server/background/scheduled_tasks/events.py b/src/dstack/_internal/server/background/scheduled_tasks/events.py index 22df5bcf33..1fbf602176 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/events.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/events.py @@ -9,7 +9,7 @@ from dstack._internal.utils.common import get_current_datetime -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def delete_events(): cutoff = get_current_datetime() - timedelta(seconds=settings.SERVER_EVENTS_TTL_SECONDS) stmt = delete(EventModel).where(EventModel.recorded_at < cutoff) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/fleets.py b/src/dstack/_internal/server/background/scheduled_tasks/fleets.py index 50c3dcfe2a..a758f86ada 100644 --- 
a/src/dstack/_internal/server/background/scheduled_tasks/fleets.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/fleets.py @@ -39,7 +39,7 @@ MIN_PROCESSING_INTERVAL = timedelta(seconds=30) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def process_fleets(): fleet_lock, fleet_lockset = get_locker(get_db().dialect_name).get_lockset( FleetModel.__tablename__ diff --git a/src/dstack/_internal/server/background/scheduled_tasks/gateways.py b/src/dstack/_internal/server/background/scheduled_tasks/gateways.py index 3b6bee012e..fc12e8e3b8 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/gateways.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/gateways.py @@ -35,7 +35,7 @@ async def process_gateways_connections(): await _process_active_connections() -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def process_gateways(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__) async with get_session_ctx() as session: diff --git a/src/dstack/_internal/server/background/scheduled_tasks/idle_volumes.py b/src/dstack/_internal/server/background/scheduled_tasks/idle_volumes.py index cd5b66bc70..c770390131 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/idle_volumes.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/idle_volumes.py @@ -19,6 +19,7 @@ volume_model_to_volume, ) from dstack._internal.server.utils import sentry_utils +from dstack._internal.settings import FeatureFlags from dstack._internal.utils import common from dstack._internal.utils.common import get_current_datetime from dstack._internal.utils.logging import get_logger @@ -26,7 +27,7 @@ logger = get_logger(__name__) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def process_idle_volumes(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__) 
async with get_session_ctx() as session: @@ -35,7 +36,9 @@ async def process_idle_volumes(): select(VolumeModel.id) .where( VolumeModel.status == VolumeStatus.ACTIVE, + VolumeModel.auto_cleanup_enabled.is_not(False), VolumeModel.deleted == False, + VolumeModel.lock_expires_at.is_(None), VolumeModel.id.not_in(lockset), ) .order_by(VolumeModel.last_processed_at.asc()) @@ -90,23 +93,31 @@ def _get_idle_time(volume: VolumeModel) -> datetime.timedelta: async def _delete_idle_volumes(session: AsyncSession, volumes: List[VolumeModel]): - # Note: Multiple volumes are deleted in the same transaction, - # so long deletion of one volume may block processing other volumes. for volume_model in volumes: logger.info("Deleting idle volume %s", volume_model.name) - try: - await _delete_idle_volume(session, volume_model) - except Exception: - logger.exception("Error when deleting idle volume %s", volume_model.name) - - volume_model.deleted = True - volume_model.deleted_at = get_current_datetime() - events.emit( - session=session, - message="Volume deleted due to exceeding auto_cleanup_duration", - actor=events.SystemActor(), - targets=[events.Target.from_model(volume_model)], - ) + if FeatureFlags.PIPELINE_PROCESSING_ENABLED: + volume_model.to_be_deleted = True + events.emit( + session=session, + message="Volume marked for deletion due to exceeding auto_cleanup_duration", + actor=events.SystemActor(), + targets=[events.Target.from_model(volume_model)], + ) + else: + try: + # Note: Multiple volumes are deleted in the same transaction, + # so long deletion of one volume may block processing other volumes. 
+ await _delete_idle_volume(session, volume_model) + except Exception: + logger.exception("Error when deleting idle volume %s", volume_model.name) + volume_model.deleted = True + volume_model.deleted_at = get_current_datetime() + events.emit( + session=session, + message="Volume deleted due to exceeding auto_cleanup_duration", + actor=events.SystemActor(), + targets=[events.Target.from_model(volume_model)], + ) await session.commit() diff --git a/src/dstack/_internal/server/background/scheduled_tasks/instances.py b/src/dstack/_internal/server/background/scheduled_tasks/instances.py index 196f347c4f..e5ecba5278 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/instances.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/instances.py @@ -152,7 +152,7 @@ async def process_instances(batch_size: int = 1): await asyncio.gather(*tasks) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def delete_instance_health_checks(): now = get_current_datetime() cutoff = now - timedelta(seconds=server_settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS) @@ -163,7 +163,7 @@ async def delete_instance_health_checks(): await session.commit() -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def _process_next_instance(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__) async with get_session_ctx() as session: diff --git a/src/dstack/_internal/server/background/scheduled_tasks/metrics.py b/src/dstack/_internal/server/background/scheduled_tasks/metrics.py index ca2d25fe5f..f75c5f3eae 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/metrics.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/metrics.py @@ -27,7 +27,7 @@ MIN_COLLECT_INTERVAL_SECONDS = 9 -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def collect_metrics(): async with get_session_ctx() as session: res = await 
session.execute( @@ -47,7 +47,7 @@ async def collect_metrics(): await _collect_jobs_metrics(batch) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def delete_metrics(): now_timestamp_micro = int(get_current_datetime().timestamp() * 1_000_000) running_timestamp_micro_cutoff = ( diff --git a/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py b/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py index 1f61300016..71ab51b07b 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py @@ -19,7 +19,7 @@ logger = get_logger(__name__) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def process_placement_groups(): lock, lockset = get_locker(get_db().dialect_name).get_lockset( PlacementGroupModel.__tablename__ diff --git a/src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py b/src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py index 2f8bf72142..5b039fe2ec 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/prometheus_metrics.py @@ -35,7 +35,7 @@ METRICS_TTL_SECONDS = 600 -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def collect_prometheus_metrics(): now = get_current_datetime() cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS) @@ -63,7 +63,7 @@ async def collect_prometheus_metrics(): await _collect_jobs_metrics(batch, now) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def delete_prometheus_metrics(): now = get_current_datetime() cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py 
b/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py index f413edf44b..9d3bd04c3b 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py @@ -103,7 +103,7 @@ async def process_running_jobs(batch_size: int = 1): await asyncio.gather(*tasks) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def _process_next_running_job(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__) async with get_session_ctx() as session: diff --git a/src/dstack/_internal/server/background/scheduled_tasks/runs.py b/src/dstack/_internal/server/background/scheduled_tasks/runs.py index e9421e5cbd..e0c6793ce5 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/runs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/runs.py @@ -76,7 +76,7 @@ async def process_runs(batch_size: int = 1): await asyncio.gather(*tasks) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def _process_next_run(): run_lock, run_lockset = get_locker(get_db().dialect_name).get_lockset(RunModel.__tablename__) job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py b/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py index 79746e9338..5d1b2e1a79 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py @@ -164,7 +164,7 @@ def _get_effective_batch_size(batch_size: int) -> int: return batch_size -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def _process_next_submitted_job(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__) async with 
get_session_ctx() as session: @@ -1042,10 +1042,15 @@ async def _attach_volumes( ) job_runtime_data.volume_names.append(volume.name) break # attach next mount point - except (ServerClientError, BackendError) as e: - logger.warning("%s: failed to attached volume: %s", fmt(job_model), repr(e)) + except ServerClientError as e: + logger.info("%s: failed to attach volume: %s", fmt(job_model), repr(e)) job_model.termination_reason = JobTerminationReason.VOLUME_ERROR - job_model.termination_reason_message = "Failed to attach volume" + job_model.termination_reason_message = f"Failed to attach volume: {e.msg}" + switch_job_status(session, job_model, JobStatus.TERMINATING) + except BackendError as e: + logger.warning("%s: failed to attach volume: %s", fmt(job_model), repr(e)) + job_model.termination_reason = JobTerminationReason.VOLUME_ERROR + job_model.termination_reason_message = f"Failed to attach volume: {str(e)}" switch_job_status(session, job_model, JobStatus.TERMINATING) except Exception: logger.exception( @@ -1053,7 +1058,7 @@ async def _attach_volumes( fmt(job_model), ) job_model.termination_reason = JobTerminationReason.VOLUME_ERROR - job_model.termination_reason_message = "Failed to attach volume" + job_model.termination_reason_message = "Failed to attach volume: unexpected error" switch_job_status(session, job_model, JobStatus.TERMINATING) finally: job_model.job_runtime_data = job_runtime_data.json() @@ -1069,10 +1074,14 @@ async def _attach_volume( compute = backend.compute() assert isinstance(compute, ComputeWithVolumeSupport) volume = volume_model_to_volume(volume_model) - # Refresh only to check if the volume wasn't deleted before the lock + # Refresh only to check if the volume wasn't deleted or marked for deletion before the lock await session.refresh(volume_model) if volume_model.deleted: raise ServerClientError("Cannot attach a deleted volume") + if volume_model.to_be_deleted: + raise ServerClientError("Cannot attach a volume marked for deletion") + if 
volume_model.lock_expires_at is not None: + raise ServerClientError("Cannot attach a volume locked for processing") attachment_data = await common_utils.run_async( compute.attach_volume, volume=volume, diff --git a/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py b/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py index 6a358dcd61..3749076c1a 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py @@ -35,7 +35,7 @@ async def process_terminating_jobs(batch_size: int = 1): await asyncio.gather(*tasks) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def _process_next_terminating_job(): job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__) instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset( diff --git a/src/dstack/_internal/server/background/scheduled_tasks/volumes.py b/src/dstack/_internal/server/background/scheduled_tasks/volumes.py index 66124619a4..a61f796947 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/volumes.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/volumes.py @@ -24,7 +24,7 @@ logger = get_logger(__name__) -@sentry_utils.instrument_background_task +@sentry_utils.instrument_scheduled_task async def process_submitted_volumes(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__) async with get_session_ctx() as session: @@ -33,6 +33,7 @@ async def process_submitted_volumes(): select(VolumeModel) .where( VolumeModel.status == VolumeStatus.SUBMITTED, + VolumeModel.deleted == False, VolumeModel.id.not_in(lockset), ) .order_by(VolumeModel.last_processed_at.asc()) diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_23_1134_ccfac6ac7924_add_volumemodel_pipeline_columns.py 
b/src/dstack/_internal/server/migrations/versions/2026/02_23_1134_ccfac6ac7924_add_volumemodel_pipeline_columns.py new file mode 100644 index 0000000000..4034f227e0 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_23_1134_ccfac6ac7924_add_volumemodel_pipeline_columns.py @@ -0,0 +1,53 @@ +"""Add VolumeModel pipeline columns + +Revision ID: ccfac6ac7924 +Revises: 140331002ece +Create Date: 2026-02-23 11:34:24.731339+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "ccfac6ac7924" +down_revision = "140331002ece" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.add_column( + sa.Column("to_be_deleted", sa.Boolean(), server_default=sa.false(), nullable=False) + ) + batch_op.add_column(sa.Column("auto_cleanup_enabled", sa.Boolean(), nullable=True)) + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("volumes", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + batch_op.drop_column("auto_cleanup_enabled") + batch_op.drop_column("to_be_deleted") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_24_0945_9a363c3cbe04_add_ix_volumes_pipeline_fetch_q_index.py b/src/dstack/_internal/server/migrations/versions/2026/02_24_0945_9a363c3cbe04_add_ix_volumes_pipeline_fetch_q_index.py new file mode 100644 index 0000000000..1d729dbbdc --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_24_0945_9a363c3cbe04_add_ix_volumes_pipeline_fetch_q_index.py @@ -0,0 +1,50 @@ +"""Add ix_volumes_pipeline_fetch_q index + +Revision ID: 9a363c3cbe04 +Revises: ccfac6ac7924 +Create Date: 2026-02-24 09:45:54.068288+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "9a363c3cbe04" +down_revision = "ccfac6ac7924" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_volumes_pipeline_fetch_q", + table_name="volumes", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_volumes_pipeline_fetch_q", + "volumes", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("deleted = 0"), + postgresql_where=sa.text("deleted IS FALSE"), + postgresql_concurrently=True, + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_volumes_pipeline_fetch_q", + table_name="volumes", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index df9cf86078..a7a8ec0bd6 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -732,7 +732,7 @@ class InstanceHealthCheckModel(BaseModel): response: Mapped[str] = mapped_column(Text) -class VolumeModel(BaseModel): +class VolumeModel(PipelineModelMixin, BaseModel): __tablename__ = "volumes" id: Mapped[uuid.UUID] = mapped_column( @@ -753,6 +753,7 @@ class VolumeModel(BaseModel): last_job_processed_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) deleted: Mapped[bool] = mapped_column(Boolean, default=False) deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + to_be_deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) # NOTE: `status` must be changed only via `switch_volume_status()` status: Mapped[VolumeStatus] = mapped_column(EnumAsString(VolumeStatus, 100), index=True) @@ -760,12 +761,23 @@ class VolumeModel(BaseModel): configuration: Mapped[str] = mapped_column(Text) volume_provisioning_data: Mapped[Optional[str]] = mapped_column(Text) + # auto_cleanup_enabled is set for all new models but old models may not have it. 
+ auto_cleanup_enabled: Mapped[Optional[bool]] = mapped_column(Boolean) attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(back_populates="volume") # Deprecated in favor of VolumeAttachmentModel.attachment_data volume_attachment_data: Mapped[Optional[str]] = mapped_column(Text) + __table_args__ = ( + Index( + "ix_volumes_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) + class VolumeAttachmentModel(BaseModel): __tablename__ = "volumes_attachments" diff --git a/src/dstack/_internal/server/routers/volumes.py b/src/dstack/_internal/server/routers/volumes.py index ead5465c48..fccb2fd476 100644 --- a/src/dstack/_internal/server/routers/volumes.py +++ b/src/dstack/_internal/server/routers/volumes.py @@ -15,6 +15,7 @@ ListVolumesRequest, ) from dstack._internal.server.security.permissions import Authenticated, ProjectMember +from dstack._internal.server.services.pipelines import PipelineHinterProtocol, get_pipeline_hinter from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, @@ -92,6 +93,7 @@ async def create_volume( body: CreateVolumeRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), ): """ Creates a volume given a volume configuration. 
@@ -103,6 +105,7 @@ async def create_volume( project=project, user=user, configuration=body.configuration, + pipeline_hinter=pipeline_hinter, ) ) diff --git a/src/dstack/_internal/server/services/jobs/__init__.py b/src/dstack/_internal/server/services/jobs/__init__.py index 2ddadbfb1e..eb10bda5c4 100644 --- a/src/dstack/_internal/server/services/jobs/__init__.py +++ b/src/dstack/_internal/server/services/jobs/__init__.py @@ -720,6 +720,10 @@ async def get_job_configured_volume_models( ) if volume_model is None: raise ResourceNotExistsError(f"Volume {mount_point.name} not found") + if volume_model.to_be_deleted: + raise ServerClientError( + f"Volume {mount_point.name} is marked for deletion and cannot be attached" + ) mount_point_volume_models.append(volume_model) volume_models.append(mount_point_volume_models) return volume_models @@ -729,7 +733,7 @@ def check_can_attach_job_volumes(volumes: List[List[Volume]]): """ Performs basic checks if volumes can be attached. This is useful to show error ASAP (when user submits the run). - If the attachment is to fail anyway, the error will be handled when proccessing submitted jobs. + If the attachment is to fail anyway, the error will be handled when processing submitted jobs. 
""" if len(volumes) == 0: return diff --git a/src/dstack/_internal/server/services/volumes.py b/src/dstack/_internal/server/services/volumes.py index 49a3d79594..f0d2fc703e 100644 --- a/src/dstack/_internal/server/services/volumes.py +++ b/src/dstack/_internal/server/services/volumes.py @@ -13,6 +13,7 @@ ResourceExistsError, ServerClientError, ) +from dstack._internal.core.models.profiles import parse_duration from dstack._internal.core.models.volumes import ( Volume, VolumeAttachment, @@ -39,8 +40,10 @@ get_locker, string_to_lock_id, ) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol from dstack._internal.server.services.plugins import apply_plugin_policies from dstack._internal.server.services.projects import list_user_project_models +from dstack._internal.settings import FeatureFlags from dstack._internal.utils import common, random_names from dstack._internal.utils.logging import get_logger @@ -58,13 +61,45 @@ def switch_volume_status( return volume_model.status = new_status + emit_volume_status_change_event( + session=session, + volume_model=volume_model, + old_status=old_status, + new_status=new_status, + status_message=volume_model.status_message, + actor=actor, + ) - msg = f"Volume status changed {old_status.upper()} -> {new_status.upper()}" - if volume_model.status_message is not None: - msg += f" ({volume_model.status_message})" + +def emit_volume_status_change_event( + session: AsyncSession, + volume_model: VolumeModel, + old_status: VolumeStatus, + new_status: VolumeStatus, + status_message: Optional[str], + actor: events.AnyActor = events.SystemActor(), +) -> None: + if old_status == new_status: + return + msg = get_volume_status_change_message( + old_status=old_status, + new_status=new_status, + status_message=status_message, + ) events.emit(session, msg, actor=actor, targets=[events.Target.from_model(volume_model)]) +def get_volume_status_change_message( + old_status: VolumeStatus, + new_status: VolumeStatus, + 
status_message: Optional[str], +) -> str: + msg = f"Volume status changed {old_status.upper()} -> {new_status.upper()}" + if status_message is not None: + msg += f" ({status_message})" + return msg + + async def list_volumes( session: AsyncSession, user: UserModel, @@ -223,6 +258,7 @@ async def create_volume( project: ProjectModel, user: UserModel, configuration: VolumeConfiguration, + pipeline_hinter: PipelineHinterProtocol, ) -> Volume: spec = await apply_plugin_policies( user=user.name, @@ -254,6 +290,7 @@ async def create_volume( else: configuration.name = await generate_volume_name(session=session, project=project) + now = common.get_current_datetime() volume_model = VolumeModel( id=uuid.uuid4(), name=configuration.name, @@ -261,7 +298,10 @@ async def create_volume( project=project, status=VolumeStatus.SUBMITTED, configuration=configuration.json(), + auto_cleanup_enabled=_get_autocleanup_enabled(configuration), attachments=[], + created_at=now, + last_processed_at=now, ) session.add(volume_model) events.emit( @@ -271,11 +311,88 @@ async def create_volume( targets=[events.Target.from_model(volume_model)], ) await session.commit() + pipeline_hinter.hint_fetch(VolumeModel.__name__) return volume_model_to_volume(volume_model) async def delete_volumes( session: AsyncSession, project: ProjectModel, names: List[str], user: UserModel +): + # Keep both delete code paths while pipeline processing is behind a feature flag: + # - pipeline path marks volumes for async deletion by VolumePipeline + # - sync path deletes volume inline for non-pipeline processing + # TODO: Drop sync path after pipeline processing is enabled by default. 
+ if FeatureFlags.PIPELINE_PROCESSING_ENABLED: + await _delete_volumes_pipeline( + session=session, + project=project, + names=names, + user=user, + ) + else: + await _delete_volumes_sync( + session=session, + project=project, + names=names, + user=user, + ) + + +async def _delete_volumes_pipeline( + session: AsyncSession, project: ProjectModel, names: List[str], user: UserModel +): + res = await session.execute( + select(VolumeModel).where( + VolumeModel.project_id == project.id, + VolumeModel.name.in_(names), + VolumeModel.deleted == False, + ) + ) + volume_models = res.scalars().all() + volumes_ids = sorted([v.id for v in volume_models]) + await session.commit() + logger.info("Deleting volumes: %s", [v.name for v in volume_models]) + async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids): + # Refetch after lock + res = await session.execute( + select(VolumeModel) + .where( + VolumeModel.project_id == project.id, + VolumeModel.id.in_(volumes_ids), + VolumeModel.deleted == False, + VolumeModel.lock_expires_at.is_(None), + ) + .options(selectinload(VolumeModel.attachments)) + .execution_options(populate_existing=True) + .order_by(VolumeModel.id) # take locks in order + .with_for_update(key_share=True, of=VolumeModel) + ) + volume_models = res.scalars().unique().all() + if len(volume_models) != len(volumes_ids): + # TODO: Make the delete endpoint fully async so we don't need to lock and error: + # put the request in queue and process in the background. + raise ServerClientError( + "Failed to delete volumes: volumes are being processed currently. Try again later." + ) + for volume_model in volume_models: + if len(volume_model.attachments) > 0: + raise ServerClientError( + f"Failed to delete volume {volume_model.name}. Volume is in use." 
+ ) + for volume_model in volume_models: + if not volume_model.to_be_deleted: + volume_model.to_be_deleted = True + events.emit( + session, + message="Volume marked for deletion", + actor=events.UserActor.from_user(user), + targets=[events.Target.from_model(volume_model)], + ) + await session.commit() + + +async def _delete_volumes_sync( + session: AsyncSession, project: ProjectModel, names: List[str], user: UserModel ): res = await session.execute( select(VolumeModel).where( @@ -494,3 +611,8 @@ def _get_volume_cost(volume: Volume) -> float: * volume.provisioning_data.price / _VOLUME_PRICING_PERIOD.total_seconds() ) + + +def _get_autocleanup_enabled(configuration: VolumeConfiguration) -> bool: + auto_cleanup_duration = parse_duration(configuration.auto_cleanup_duration) + return auto_cleanup_duration is not None and auto_cleanup_duration > 0 diff --git a/src/dstack/_internal/server/utils/sentry_utils.py b/src/dstack/_internal/server/utils/sentry_utils.py index 8dd7326b73..a99173b082 100644 --- a/src/dstack/_internal/server/utils/sentry_utils.py +++ b/src/dstack/_internal/server/utils/sentry_utils.py @@ -6,15 +6,29 @@ from sentry_sdk.types import Event, Hint -def instrument_background_task(f): +def instrument_scheduled_task(f): @functools.wraps(f) async def wrapper(*args, **kwargs): - with sentry_sdk.start_transaction(name=f"background.{f.__name__}"): - return await f(*args, **kwargs) + with sentry_sdk.isolation_scope(): + with sentry_sdk.start_transaction(name=f"scheduled_tasks.{f.__name__}"): + return await f(*args, **kwargs) return wrapper +def instrument_named_task(name: str): + def decorator(f): + @functools.wraps(f) + async def wrapper(*args, **kwargs): + with sentry_sdk.isolation_scope(): + with sentry_sdk.start_transaction(name=name): + return await f(*args, **kwargs) + + return wrapper + + return decorator + + class AsyncioCancelledErrorFilterEventProcessor: # See https://docs.sentry.io/platforms/python/configuration/filtering/#filtering-error-events def 
__call__(self, event: Event, hint: Hint) -> Optional[Event]: diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py b/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py new file mode 100644 index 0000000000..4d22c59b97 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py @@ -0,0 +1,336 @@ +import uuid +from datetime import datetime, timezone +from unittest.mock import Mock, patch + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import BackendError, BackendNotAvailable +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.volumes import VolumeProvisioningData, VolumeStatus +from dstack._internal.server.background.pipeline_tasks.volumes import ( + VolumePipelineItem, + VolumeWorker, +) +from dstack._internal.server.models import VolumeModel +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_project, + create_user, + create_volume, + get_volume_configuration, + get_volume_provisioning_data, + list_events, +) + + +@pytest.fixture +def worker() -> VolumeWorker: + return VolumeWorker(queue=Mock(), heartbeater=Mock()) + + +def _volume_to_pipeline_item(volume_model: VolumeModel) -> VolumePipelineItem: + assert volume_model.lock_token is not None + assert volume_model.lock_expires_at is not None + return VolumePipelineItem( + __tablename__=volume_model.__tablename__, + id=volume_model.id, + lock_token=volume_model.lock_token, + lock_expires_at=volume_model.lock_expires_at, + prev_lock_expired=False, + status=volume_model.status, + to_be_deleted=volume_model.to_be_deleted, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestVolumeWorkerSubmitted: + async def test_submitted_to_active(self, test_db, session: AsyncSession, worker: VolumeWorker): + project = await create_project(session=session) + user = 
await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.create_volume.return_value = VolumeProvisioningData( + backend=BackendType.AWS, + volume_id="vol-1234", + size_gb=100, + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.create_volume.assert_called_once() + backend_mock.compute.return_value.register_volume.assert_not_called() + + await session.refresh(volume) + assert volume.status == VolumeStatus.ACTIVE + assert volume.volume_provisioning_data is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume status changed SUBMITTED -> ACTIVE" + + async def test_registers_external_volume( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + configuration=get_volume_configuration(volume_id="vol-external-123"), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + 
backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.register_volume.return_value = ( + VolumeProvisioningData( + backend=BackendType.AWS, + volume_id="vol-external-123", + size_gb=100, + ) + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.register_volume.assert_called_once() + backend_mock.compute.return_value.create_volume.assert_not_called() + + await session.refresh(volume) + assert volume.status == VolumeStatus.ACTIVE + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume status changed SUBMITTED -> ACTIVE" + + async def test_marks_volume_failed_if_backend_not_available( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + get_backend_mock.side_effect = BackendNotAvailable() + await worker.process(_volume_to_pipeline_item(volume)) + get_backend_mock.assert_called_once() + + await session.refresh(volume) + assert volume.status == VolumeStatus.FAILED + assert volume.status_message == "Backend not available" + events = await list_events(session) + assert len(events) == 1 + assert ( + events[0].message + == "Volume status changed SUBMITTED -> FAILED (Backend not available)" + ) + + async def test_marks_volume_failed_if_backend_returns_error( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await 
create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.create_volume.side_effect = BackendError( + "Some error" + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.create_volume.assert_called_once() + + await session.refresh(volume) + assert volume.status == VolumeStatus.FAILED + assert volume.status_message == "Some error" + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume status changed SUBMITTED -> FAILED (Some error)" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestVolumeWorkerDeleted: + async def test_marks_volume_deleted( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(backend=BackendType.AWS), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + volume.to_be_deleted = True + await session.commit() + + with patch( + 
"dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.delete_volume.assert_called_once() + + await session.refresh(volume) + assert volume.deleted is True + assert volume.deleted_at is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" + + async def test_marks_external_volume_deleted_without_backend_call( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + configuration=get_volume_configuration(volume_id="vol-external-123"), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + volume.to_be_deleted = True + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + await worker.process(_volume_to_pipeline_item(volume)) + get_backend_mock.assert_not_called() + + await session.refresh(volume) + assert volume.deleted is True + assert volume.deleted_at is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" + + async def test_marks_volume_deleted_if_backend_not_available( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + 
project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(backend=BackendType.AWS), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + volume.to_be_deleted = True + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + get_backend_mock.side_effect = BackendNotAvailable() + await worker.process(_volume_to_pipeline_item(volume)) + get_backend_mock.assert_called_once() + + await session.refresh(volume) + assert volume.deleted is True + assert volume.deleted_at is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" + + async def test_marks_volume_deleted_if_backend_delete_errors( + self, test_db, session: AsyncSession, worker: VolumeWorker + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(backend=BackendType.AWS), + ) + volume.lock_token = uuid.uuid4() + volume.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + volume.to_be_deleted = True + await session.commit() + + with patch( + "dstack._internal.server.background.pipeline_tasks.volumes.backends_services.get_project_backend_by_type_or_error" + ) as get_backend_mock: + backend_mock = Mock() + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.delete_volume.side_effect = BackendError( + "Delete failed" + ) + get_backend_mock.return_value = backend_mock + + await worker.process(_volume_to_pipeline_item(volume)) + + get_backend_mock.assert_called_once() + backend_mock.compute.return_value.delete_volume.assert_called_once() + + 
await session.refresh(volume) + assert volume.deleted is True + assert volume.deleted_at is not None + events = await list_events(session) + assert len(events) == 1 + assert events[0].message == "Volume deleted" diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py b/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py index 6a7acf0c43..5bf844feea 100644 --- a/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_idle_volumes.py @@ -22,12 +22,25 @@ get_volume_provisioning_data, list_events, ) +from dstack._internal.settings import FeatureFlags from dstack._internal.utils.common import get_current_datetime +@pytest.fixture +def patch_pipeline_processing_flag(monkeypatch: pytest.MonkeyPatch): + def _apply(enabled: bool): + monkeypatch.setattr(FeatureFlags, "PIPELINE_PROCESSING_ENABLED", enabled) + + return _apply + + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) -class TestProcessIdleVolumes: +class TestProcessIdleVolumesScheduledTask: + @pytest.fixture(autouse=True) + def _patch_feature_flag(self, patch_pipeline_processing_flag): + patch_pipeline_processing_flag(False) + async def test_deletes_idle_volumes(self, test_db, session: AsyncSession): project = await create_project(session=session) user = await create_user(session=session) @@ -71,17 +84,169 @@ async def test_deletes_idle_volumes(self, test_db, session: AsyncSession): m.return_value = aws_mock aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) await process_idle_volumes() + m.assert_called_once() await session.refresh(volume1) await session.refresh(volume2) events = await list_events(session) + assert not volume1.to_be_deleted assert volume1.deleted assert volume1.deleted_at is not None + assert not volume2.to_be_deleted assert not volume2.deleted assert volume2.deleted_at is None assert len(events) == 1 assert 
events[0].message == "Volume deleted due to exceeding auto_cleanup_duration" + async def test_deletes_idle_volume_with_null_auto_cleanup_enabled( + self, test_db, session: AsyncSession + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=get_volume_configuration( + name="test-volume", + auto_cleanup_duration="1h", + ), + volume_provisioning_data=get_volume_provisioning_data(), + last_job_processed_at=datetime.datetime.now(datetime.timezone.utc) + - datetime.timedelta(hours=2), + ) + volume.auto_cleanup_enabled = None + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + await process_idle_volumes() + m.assert_called_once() + + await session.refresh(volume) + events = await list_events(session) + assert not volume.to_be_deleted + assert volume.deleted + assert volume.deleted_at is not None + assert len(events) == 1 + assert events[0].message == "Volume deleted due to exceeding auto_cleanup_duration" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestProcessIdleVolumesPipelineTask: + @pytest.fixture(autouse=True) + def _patch_feature_flag(self, patch_pipeline_processing_flag): + patch_pipeline_processing_flag(True) + + async def test_deletes_idle_volumes(self, test_db, session: AsyncSession): + project = await create_project(session=session) + user = await create_user(session=session) + + config1 = get_volume_configuration( + name="test-volume", + auto_cleanup_duration="1h", + ) + config2 = get_volume_configuration( + name="test-volume", + auto_cleanup_duration="3h", + ) + volume1 = await create_volume( + 
session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=config1, + volume_provisioning_data=get_volume_provisioning_data(), + last_job_processed_at=datetime.datetime.now(datetime.timezone.utc) + - datetime.timedelta(hours=2), + ) + volume2 = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=config2, + volume_provisioning_data=get_volume_provisioning_data(), + last_job_processed_at=datetime.datetime.now(datetime.timezone.utc) + - datetime.timedelta(hours=2), + ) + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + await process_idle_volumes() + m.assert_not_called() + + await session.refresh(volume1) + await session.refresh(volume2) + events = await list_events(session) + assert volume1.to_be_deleted + assert not volume1.deleted + assert volume1.deleted_at is None + assert not volume2.to_be_deleted + assert not volume2.deleted + assert volume2.deleted_at is None + assert len(events) == 1 + assert ( + events[0].message + == "Volume marked for deletion due to exceeding auto_cleanup_duration" + ) + + async def test_deletes_idle_volume_with_null_auto_cleanup_enabled( + self, test_db, session: AsyncSession + ): + project = await create_project(session=session) + user = await create_user(session=session) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + backend=BackendType.AWS, + configuration=get_volume_configuration( + name="test-volume", + auto_cleanup_duration="1h", + ), + volume_provisioning_data=get_volume_provisioning_data(), + last_job_processed_at=datetime.datetime.now(datetime.timezone.utc) + - datetime.timedelta(hours=2), + ) + 
volume.auto_cleanup_enabled = None + await session.commit() + + with patch( + "dstack._internal.server.services.backends.get_project_backend_by_type_or_error" + ) as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + await process_idle_volumes() + m.assert_not_called() + + await session.refresh(volume) + events = await list_events(session) + assert volume.to_be_deleted + assert not volume.deleted + assert volume.deleted_at is None + assert len(events) == 1 + assert ( + events[0].message + == "Volume marked for deletion due to exceeding auto_cleanup_duration" + ) + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py b/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py index b06eb50ec2..f33f608c71 100644 --- a/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py @@ -492,6 +492,79 @@ async def test_assigns_job_to_instance_with_volumes(self, test_db, session: Asyn ) assert job.instance.volume_attachments[0].volume == volume + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_fails_job_when_attaching_volume_marked_for_deletion( + self, test_db, session: AsyncSession + ): + project = await create_project(session) + user = await create_user(session) + repo = await create_repo( + session=session, + project_id=project.id, + ) + volume = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + volume_provisioning_data=get_volume_provisioning_data(), + backend=BackendType.AWS, + region="us-east-1", + ) + volume.to_be_deleted = True + await session.commit() + fleet = await create_fleet(session=session, project=project) + await create_instance( 
+ session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + backend=BackendType.AWS, + region="us-east-1", + ) + run_spec = get_run_spec(run_name="test-run", repo_id=repo.name) + run_spec.configuration.volumes = [VolumeMountPoint(name=volume.name, path="/volume")] + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + run_name="test-run", + run_spec=run_spec, + ) + job = await create_job( + session=session, + run=run, + instance_assigned=False, + ) + + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + backend_mock = Mock() + m.return_value = backend_mock + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.attach_volume.return_value = VolumeAttachmentData() + # Submitted jobs processing happens in two steps + await process_submitted_jobs() + await process_submitted_jobs() + backend_mock.compute.return_value.attach_volume.assert_not_called() + + await session.refresh(job) + res = await session.execute( + select(JobModel).options( + joinedload(JobModel.instance) + .joinedload(InstanceModel.volume_attachments) + .joinedload(VolumeAttachmentModel.volume) + ) + ) + job = res.unique().scalar_one() + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.VOLUME_ERROR + assert job.termination_reason_message is not None + assert "marked for deletion and cannot be attached" in job.termination_reason_message + assert job.instance is None + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_assigns_job_to_shared_instance(self, test_db, session: AsyncSession): diff --git a/src/tests/_internal/server/services/test_volumes.py b/src/tests/_internal/server/services/test_volumes.py index 4de9c3f050..6bfb9bae66 100644 --- a/src/tests/_internal/server/services/test_volumes.py +++ 
b/src/tests/_internal/server/services/test_volumes.py @@ -10,7 +10,10 @@ _get_volume_cost, _validate_volume_configuration, ) -from dstack._internal.server.testing.common import get_volume, get_volume_provisioning_data +from dstack._internal.server.testing.common import ( + get_volume, + get_volume_provisioning_data, +) class TestValidateVolumeConfiguration: From 697600c169309477ebe4890a02f3e660516a0391 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 25 Feb 2026 12:43:35 +0100 Subject: [PATCH 164/187] [Website] Minor edits (#3609) --- docs/assets/stylesheets/extra.css | 36 ++++++++++++----------------- docs/assets/stylesheets/landing.css | 2 +- docs/overrides/header-2.html | 15 ++++++++++-- docs/overrides/home.html | 6 ++--- docs/overrides/main.html | 6 ++--- mkdocs.yml | 2 +- 6 files changed, 36 insertions(+), 31 deletions(-) diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index 7939548fed..43f740a162 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -1272,37 +1272,30 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { padding-right: 22px; } - .md-tabs__item:nth-child(1) { + .md-tabs__item:nth-child(1), .md-tabs__item:nth-child(4) { display: none; } - /* .md-tabs__item:nth-child(5):after { - content: url('data:image/svg+xml,'); - line-height: 14px; - margin-left: 4px; - position: relative; - top: 16px; - margin-right: -7px; - } */ - /*.md-tabs__item:nth-child(6) { + + .md-tabs__item:nth-child(7) { margin-left: auto; - padding-right: 0.8rem; + padding-right: 0.5rem; } - .md-tabs__item:nth-child(n+6) .md-tabs__link { + .md-tabs__item:nth-child(n+7) .md-tabs__link { visibility: hidden; width: 35px; display: inline-block; - margin-top: 10px; + margin-top: 12px; } - .md-tabs__item:nth-child(n+6) .md-tabs__link:before { + .md-tabs__item:nth-child(n+7) .md-tabs__link:before { width: 38px; height: 38px; margin-top: 4px; visibility: 
visible; - }*/ + } /* .twemoji.external { position: relative; @@ -1320,19 +1313,20 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { margin-right: -7px; } */ - /*.md-tabs__item:nth-child(6) .md-tabs__link:before { + .md-tabs__item:nth-child(7) .md-tabs__link:before { position: relative; content: ''; - width: 37px; - height: 31px; + width: 34px; + height: 28px; display: inline-block; -webkit-mask: url('data:image/svg+xml,') no-repeat 50% 50%; mask: url('data:image/svg+xml,') no-repeat 50% 50%; -webkit-mask-size: cover; mask-size: cover; - background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); + background-color: rgba(0,0,0,0.87); + /* background: -webkit-linear-gradient(45deg, #0048ff, #ce00ff); */ margin-top: 1px; - }*/ + } .md-tabs__link { display: flex; @@ -1340,7 +1334,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) { } .md-tabs__link { - font-size: 0.85rem; + font-size: 0.88rem; font-weight: 500; color: rgba(0,0,0,0.87); /*letter-spacing: -0.5px;*/ diff --git a/docs/assets/stylesheets/landing.css b/docs/assets/stylesheets/landing.css index 1ebb351b90..824b0c632a 100644 --- a/docs/assets/stylesheets/landing.css +++ b/docs/assets/stylesheets/landing.css @@ -1135,7 +1135,7 @@ } @media screen and (max-width: 44.984375em) { - .md-header__buttons .md-button:before, .md-header__buttons .md-button:after { + .md-header__buttons .md-button:not(.github):before, .md-header__buttons .md-button:not(.github):after { display: none !important; } diff --git a/docs/overrides/header-2.html b/docs/overrides/header-2.html index dbcb7ad9ae..6310a155c6 100644 --- a/docs/overrides/header-2.html +++ b/docs/overrides/header-2.html @@ -61,8 +61,19 @@ {% endif %}-->
    - GitHub - dstack Sky + GitHub + + dstack Sky
    {% if "navigation.tabs.sticky" in features %} diff --git a/docs/overrides/home.html b/docs/overrides/home.html index 017453a67d..05f5468198 100644 --- a/docs/overrides/home.html +++ b/docs/overrides/home.html @@ -64,8 +64,8 @@

    Open-source - Get started - + Install open-source + Get started in minutes

    class="md-button md-button--primary" style="margin-right: 10px"> Quickstart - + GitHub +Services now support native Prefill-Decode disaggregation {% endblock %} {% block footer %} @@ -154,7 +153,7 @@ @@ -164,6 +163,7 @@ Terms Privacy + Blog
    diff --git a/mkdocs.yml b/mkdocs.yml index ef745e6548..1f93084515 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -315,7 +315,7 @@ nav: - blog/index.md - Case studies: blog/case-studies.md - Benchmarks: blog/benchmarks.md -# - Discord: https://discord.gg/u8SmfwPpMd" target="_blank + - Discord: https://discord.gg/u8SmfwPpMd" target="_blank # - Changelog: https://github.com/dstackai/dstack/releases" target="_blank # - GitHub: https://github.com/dstackai/dstack" target="_blank # - Sign in: https://sky.dstack.ai" target="_blank From f934bb43484fbcf7ae136d1fddf4c42c1efe10ef Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 25 Feb 2026 14:53:29 +0100 Subject: [PATCH 165/187] Add robots.txt and structured data for SEO (#3610) Add docs/robots.txt pointing crawlers to the sitemap. Add JSON-LD structured data to main.html: WebSite + Organization schema on the homepage (helps Google show sitelinks and knowledge panel), and BreadcrumbList schema on all other pages (helps Google show breadcrumb trails in search results). Also update the Twitter social link to x.com. AI Assistance: Claude Co-authored-by: Cursor --- docs/overrides/main.html | 79 ++++++++++++++++++++++++++++++++++++++++ docs/robots.txt | 4 ++ mkdocs.yml | 2 +- 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 docs/robots.txt diff --git a/docs/overrides/main.html b/docs/overrides/main.html index ed5d28a8c3..000ff9bec1 100644 --- a/docs/overrides/main.html +++ b/docs/overrides/main.html @@ -5,6 +5,85 @@ +{# + Structured data (JSON-LD) for SEO. + - Homepage gets WebSite + Organization schema (helps Google show sitelinks and knowledge panel). + - All other pages get BreadcrumbList schema (helps Google show breadcrumb trails in results). + Breadcrumb URLs are resolved via _find_leaf_url because MkDocs nav sections don't have + their own URLs — we use the first descendant page's URL as a proxy. 
+ Dedup by title to collapse nav levels duplicated by plugins (e.g. the blog plugin nests + "Blog" inside "Blog"). The current page is omitted when its title matches the last + ancestor (e.g. /examples/ is both the "Examples" section index and the page itself). +#} +{% macro _find_leaf_url(nav_item) -%} + {%- if nav_item.url -%} + /{{ nav_item.url }} + {%- elif nav_item.children -%} + {{ _find_leaf_url(nav_item.children | first) }} + {%- endif -%} +{%- endmacro %} +{% if page.is_homepage %} + +{% elif page.ancestors | length > 0 %} + +{% endif %} {% endblock %} {% block container %} diff --git a/docs/robots.txt b/docs/robots.txt new file mode 100644 index 0000000000..b38b74b551 --- /dev/null +++ b/docs/robots.txt @@ -0,0 +1,4 @@ +User-agent: * +Allow: / + +Sitemap: https://dstack.ai/sitemap.xml diff --git a/mkdocs.yml b/mkdocs.yml index 1f93084515..58c176f3db 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -212,7 +212,7 @@ extra: - icon: /fontawesome/brands/discord link: https://discord.gg/u8SmfwPpMd - icon: /simple/x - link: https://twitter.com/dstackai + link: https://x.com/dstackai - icon: /fontawesome/brands/linkedin link: https://www.linkedin.com/company/dstackai status: From 80f9c39d45d7f098fb437bd38e8009b3dba401cb Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 25 Feb 2026 23:20:49 +0100 Subject: [PATCH 166/187] Add templates API and launch wizard UI (#3605) * Add server API for UI templates Add POST /api/project/{project_name}/templates/list endpoint that serves UI templates from an external git repo configured via DSTACK_SERVER_TEMPLATES_REPO. Templates are YAML files under .dstack/templates/ in the repo, parsed into typed pydantic models with a discriminated union for parameter types. Results are cached with a 3-minute TTL using cachetools.TTLCache. Currently returns only server-wide templates; project-specific templates will be added in a future iteration. 
Co-authored-by: Cursor * Fix auth test to accept both 401 and 403 Match codebase pattern for version-dependent HTTPBearer status codes. Co-authored-by: Cursor * Choice project and template * Implemented supporting templates for run env * Refactoring templates after review: Added default project Template Choice as cards Default spot-policy filter for offers Fix small defects * Remove log * Update templates API and UI to match new schema Align backend and frontend with the updated template format: type "template" (was "ui-template"), name (was id), description (new), configuration (was template). Improve the launch wizard with proper env parameter handling ($random-password detection, copy-before-proceed validation, password input, info panel), clickable template cards with description and type sections, and renamed routes/labels. Co-authored-by: Cursor * Improve Configuration step with info panel and label Replace inline description with a short label, description text, and an Info link that opens a help panel with Cloudscape-styled external links to dev environments, tasks, and services concept pages. Co-authored-by: Cursor * Add GPU toggle, env password handling, and wizard polish Resources step: GPU toggle (off by default) inside OfferList header with disabled prop to skip API/hide content when off. Generates resources.gpu:0 when disabled, passes backends only from user filter. Env params: respect $random-password, password input with copy-before- proceed validation, info panel, dynamic env variable name in YAML. Also: fix useFilters permanentFilters dep, add onChangeBackendFilter callback, FormToggle errorText support, configuration info panel with concept links. Co-authored-by: Cursor * Rename CreateDevEnvironment to Launch Rename directory, component, localStorage key, translation keys, and all references from the old dev-environment-specific naming to the generalized Launch naming. 
Co-authored-by: Cursor * Add Endpoint section for service runs on the run details page Show service readiness status with waiting states and the endpoint URL once the service is up and probes (if configured) have passed. Replaces the inline service URL in the General section. Co-authored-by: Cursor * Fix YAML generation and resource parsing for templates - Disable line wrapping in jsYaml.dump to preserve multi-line command formatting from templates - Handle numeric and object resource values (e.g. gpu: 0) in addition to string shorthand - Show empty placeholder for disabled offer cards Co-authored-by: Cursor --------- Co-authored-by: Cursor Co-authored-by: Oleg Vavilov --- frontend/src/api.ts | 2 + frontend/src/components/form/Input/index.tsx | 2 + frontend/src/components/form/Input/types.ts | 2 +- frontend/src/components/form/Select/index.tsx | 2 + frontend/src/components/form/Select/types.ts | 2 +- frontend/src/components/form/Toogle/index.tsx | 5 +- frontend/src/components/form/Toogle/types.ts | 4 +- frontend/src/hooks/useProjectFilter.ts | 3 +- frontend/src/libs/filters.ts | 23 +- frontend/src/libs/index.ts | 1 + frontend/src/libs/password.ts | 106 ++ frontend/src/locale/en.json | 1493 +++++++++-------- .../src/pages/Offers/List/hooks/useFilters.ts | 96 +- frontend/src/pages/Offers/List/index.tsx | 42 +- .../Runs/CreateDevEnvironment/constants.tsx | 50 - .../hooks/useGenerateYaml.ts | 52 - .../pages/Runs/CreateDevEnvironment/index.tsx | 457 ----- .../index.tsx | 2 +- .../RunDetails/ConnectToServiceRun/index.tsx | 47 + .../pages/Runs/Details/RunDetails/index.tsx | 18 +- .../components/ParamsWizardStep/index.tsx | 285 ++++ frontend/src/pages/Runs/Launch/constants.tsx | 93 + .../getRunSpecConfigurationResources.ts | 36 +- .../Runs/Launch/hooks/useGenerateYaml.ts | 64 + .../hooks/useGetRunSpecFromYaml.ts | 21 +- .../Launch/hooks/useValidationResolver.ts | 123 ++ frontend/src/pages/Runs/Launch/index.tsx | 389 +++++ .../styles.module.scss | 0 
.../{CreateDevEnvironment => Launch}/types.ts | 7 +- frontend/src/pages/Runs/List/index.tsx | 32 +- frontend/src/pages/Runs/index.ts | 2 +- frontend/src/router.tsx | 4 +- frontend/src/routes.ts | 2 +- frontend/src/services/templates.ts | 32 + frontend/src/store.ts | 3 + frontend/src/types/run.d.ts | 100 +- frontend/src/types/template.d.ts | 17 + src/dstack/_internal/core/models/templates.py | 71 + src/dstack/_internal/server/app.py | 2 + .../_internal/server/routers/templates.py | 21 + .../_internal/server/services/templates.py | 99 ++ src/dstack/_internal/server/settings.py | 2 + .../_internal/core/models/test_templates.py | 264 +++ .../server/routers/test_templates.py | 118 ++ .../server/services/test_templates.py | 286 ++++ 45 files changed, 3035 insertions(+), 1447 deletions(-) create mode 100644 frontend/src/libs/password.ts delete mode 100644 frontend/src/pages/Runs/CreateDevEnvironment/constants.tsx delete mode 100644 frontend/src/pages/Runs/CreateDevEnvironment/hooks/useGenerateYaml.ts delete mode 100644 frontend/src/pages/Runs/CreateDevEnvironment/index.tsx create mode 100644 frontend/src/pages/Runs/Details/RunDetails/ConnectToServiceRun/index.tsx create mode 100644 frontend/src/pages/Runs/Launch/components/ParamsWizardStep/index.tsx create mode 100644 frontend/src/pages/Runs/Launch/constants.tsx rename frontend/src/pages/Runs/{CreateDevEnvironment => Launch}/helpers/getRunSpecConfigurationResources.ts (68%) create mode 100644 frontend/src/pages/Runs/Launch/hooks/useGenerateYaml.ts rename frontend/src/pages/Runs/{CreateDevEnvironment => Launch}/hooks/useGetRunSpecFromYaml.ts (90%) create mode 100644 frontend/src/pages/Runs/Launch/hooks/useValidationResolver.ts create mode 100644 frontend/src/pages/Runs/Launch/index.tsx rename frontend/src/pages/Runs/{CreateDevEnvironment => Launch}/styles.module.scss (100%) rename frontend/src/pages/Runs/{CreateDevEnvironment => Launch}/types.ts (64%) create mode 100644 frontend/src/services/templates.ts create mode 100644 
frontend/src/types/template.d.ts create mode 100644 src/dstack/_internal/core/models/templates.py create mode 100644 src/dstack/_internal/server/routers/templates.py create mode 100644 src/dstack/_internal/server/services/templates.py create mode 100644 src/tests/_internal/core/models/test_templates.py create mode 100644 src/tests/_internal/server/routers/test_templates.py create mode 100644 src/tests/_internal/server/services/test_templates.py diff --git a/frontend/src/api.ts b/frontend/src/api.ts index 144a21bc86..f7c9e20d51 100644 --- a/frontend/src/api.ts +++ b/frontend/src/api.ts @@ -119,6 +119,8 @@ export const API = { SECRETS_DELETE: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/secrets/delete`, // GPUS GPUS_LIST: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/gpus/list`, + // GPUS + TEMPLATES_LIST: (projectName: IProject['project_name']) => `${API.BASE()}/project/${projectName}/templates/list`, }, BACKENDS: { diff --git a/frontend/src/components/form/Input/index.tsx b/frontend/src/components/form/Input/index.tsx index 784de33e9b..b0c745b105 100644 --- a/frontend/src/components/form/Input/index.tsx +++ b/frontend/src/components/form/Input/index.tsx @@ -10,6 +10,7 @@ export const FormInput = ({ name, control, rules, + defaultValue, label, info, constraintText, @@ -30,6 +31,7 @@ export const FormInput = ({ name={name} control={control} rules={rules} + defaultValue={defaultValue} render={({ field: { onChange, ...fieldRest }, fieldState: { error } }) => { return ( = Omit & Omit & - Pick, 'control' | 'name' | 'rules'> & { + Pick, 'control' | 'name' | 'rules' | 'defaultValue'> & { leftContent?: ReactNode; hotspotId?: string; }; diff --git a/frontend/src/components/form/Select/index.tsx b/frontend/src/components/form/Select/index.tsx index b1b5f9b2ea..77dee60e11 100644 --- a/frontend/src/components/form/Select/index.tsx +++ b/frontend/src/components/form/Select/index.tsx @@ -9,6 +9,7 @@ import { 
FormSelectProps } from './types'; export const FormSelect = ({ name, rules, + defaultValue, control, label, info, @@ -24,6 +25,7 @@ export const FormSelect = ({ name={name} control={control} rules={rules} + defaultValue={defaultValue} render={({ field: { onChange, ...fieldRest }, fieldState: { error } }) => { const selectedOption = props.options?.find((i) => i.value === fieldRest.value) ?? null; diff --git a/frontend/src/components/form/Select/types.ts b/frontend/src/components/form/Select/types.ts index d10170c02b..91bfd32427 100644 --- a/frontend/src/components/form/Select/types.ts +++ b/frontend/src/components/form/Select/types.ts @@ -7,6 +7,6 @@ export type FormSelectOptions = ReadonlyArray; export type FormSelectProps = Omit & Omit & - Pick, 'control' | 'name' | 'rules'> & { + Pick, 'control' | 'name' | 'rules' | 'defaultValue'> & { options: ReadonlyArray; }; diff --git a/frontend/src/components/form/Toogle/index.tsx b/frontend/src/components/form/Toogle/index.tsx index d8909e48e8..1cdb3c62ad 100644 --- a/frontend/src/components/form/Toogle/index.tsx +++ b/frontend/src/components/form/Toogle/index.tsx @@ -11,6 +11,7 @@ export const FormToggle = ({ name, control, rules, + defaultValue, label, info, constraintText, @@ -22,6 +23,7 @@ export const FormToggle = ({ onChange: onChangeProp, toggleDescription, toggleInfo, + errorText: externalErrorText, ...props }: FormToggleProps) => { return ( @@ -29,6 +31,7 @@ export const FormToggle = ({ name={name} control={control} rules={rules} + defaultValue={defaultValue} render={({ field: { onChange, value, ...fieldRest }, fieldState: { error } }) => { return ( ({ stretch={stretch} constraintText={constraintText} secondaryControl={secondaryControl} - errorText={error?.message} + errorText={error?.message || externalErrorText} > {leftContent} diff --git a/frontend/src/components/form/Toogle/types.ts b/frontend/src/components/form/Toogle/types.ts index 7170922894..bc037c23f7 100644 --- 
a/frontend/src/components/form/Toogle/types.ts +++ b/frontend/src/components/form/Toogle/types.ts @@ -4,8 +4,8 @@ import { FormFieldProps } from '@cloudscape-design/components/form-field'; import { ToggleProps } from '@cloudscape-design/components/toggle'; export type FormToggleProps = Omit & - Omit & - Pick, 'control' | 'name' | 'rules'> & { + FormFieldProps & + Pick, 'control' | 'name' | 'rules' | 'defaultValue'> & { toggleDescription?: ReactNode; leftContent?: ReactNode; toggleLabel?: ReactNode | string; diff --git a/frontend/src/hooks/useProjectFilter.ts b/frontend/src/hooks/useProjectFilter.ts index 58e573d31d..5c54fd28e0 100644 --- a/frontend/src/hooks/useProjectFilter.ts +++ b/frontend/src/hooks/useProjectFilter.ts @@ -16,7 +16,7 @@ export const useProjectFilter = ({ localStorePrefix }: Args) => { null, ); - const { data: projectsData } = useGetProjectsQuery({}); + const { data: projectsData, isLoading } = useGetProjectsQuery({}); const projectOptions = useMemo(() => { if (!projectsData?.data?.length) return []; @@ -40,5 +40,6 @@ export const useProjectFilter = ({ localStorePrefix }: Args) => { projectOptions, selectedProject, setSelectedProject, + isLoadingProjectOptions: isLoading, } as const; }; diff --git a/frontend/src/libs/filters.ts b/frontend/src/libs/filters.ts index 7546f8d821..8690734dd9 100644 --- a/frontend/src/libs/filters.ts +++ b/frontend/src/libs/filters.ts @@ -78,16 +78,37 @@ export const EMPTY_QUERY: PropertyFilterProps.Query = { export const requestParamsToTokens = ({ searchParams, filterKeys, + defaultFilterValues, }: { searchParams: URLSearchParams; filterKeys: Record; + defaultFilterValues?: Partial>; }): PropertyFilterProps.Query => { const tokens = []; + const filterKeysValues = Object.values(filterKeys); + + if (defaultFilterValues) { + Object.keys(defaultFilterValues).forEach((defaultFilterKey) => { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + const defaultFilterValue: string[] = 
Array.isArray(defaultFilterValues[defaultFilterKey]) + ? // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + defaultFilterValues[defaultFilterKey] + : // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-expect-error + [defaultFilterValues[defaultFilterKey]]; + + defaultFilterValue.forEach((value) => { + tokens.push({ propertyKey: defaultFilterKey, operator: '=', value: value }); + }); + }); + } // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore for (const [paramKey, paramValue] of searchParams.entries()) { - if (Object.values(filterKeys).includes(paramKey)) { + if (filterKeysValues.includes(paramKey)) { tokens.push({ propertyKey: paramKey, operator: '=', value: paramValue }); } } diff --git a/frontend/src/libs/index.ts b/frontend/src/libs/index.ts index 523923ab4b..4d5123787c 100644 --- a/frontend/src/libs/index.ts +++ b/frontend/src/libs/index.ts @@ -5,6 +5,7 @@ export { getServerError, } from './serverErrors'; import { format, formatDistanceToNowStrict } from 'date-fns'; +export { generateSecurePassword, generatePassword, generateSimplePassword } from './password'; // eslint-disable-next-line @typescript-eslint/no-explicit-any export function arrayToRecordByKeyName(array: T[], selector: K) { diff --git a/frontend/src/libs/password.ts b/frontend/src/libs/password.ts new file mode 100644 index 0000000000..e430cdcf15 --- /dev/null +++ b/frontend/src/libs/password.ts @@ -0,0 +1,106 @@ +const UPPERCASE_LETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; +const LOWERCASE_LETTERS = 'abcdefghijklmnopqrstuvwxyz'; +const NUMBERS = '0123456789'; +const SPECIAL_CHARACTERS = '@#$^_+-'; + +interface PasswordOptions { + length: number; + includeUppercase?: boolean; + includeLowercase?: boolean; + includeNumbers?: boolean; + includeSpecial?: boolean; +} +function generatePassword(options: PasswordOptions): string { + const { length, includeUppercase = true, includeLowercase = true, includeNumbers = true, 
includeSpecial = true } = options; + + let allowedChars = ''; + + if (includeUppercase) allowedChars += UPPERCASE_LETTERS; + if (includeLowercase) allowedChars += LOWERCASE_LETTERS; + if (includeNumbers) allowedChars += NUMBERS; + if (includeSpecial) allowedChars += SPECIAL_CHARACTERS; + + if (allowedChars.length === 0) { + throw new Error('No character type is selected for the password'); + } + + if (length < 4) { + throw new Error('The password must be at least 4 characters long'); + } + + let password = ''; + const randomValues = new Uint32Array(length); + + crypto.getRandomValues(randomValues); + + for (let i = 0; i < length; i++) { + const randomIndex = randomValues[i] % allowedChars.length; + password += allowedChars[randomIndex]; + } + + return password; +} + +function generateSimplePassword(length: number): string { + const ALL_CHARS = UPPERCASE_LETTERS + LOWERCASE_LETTERS + NUMBERS + SPECIAL_CHARACTERS; + + if (length < 1) { + throw new Error('The password length must be a positive number'); + } + + let password = ''; + const randomValues = new Uint32Array(length); + + crypto.getRandomValues(randomValues); + + for (let i = 0; i < length; i++) { + const randomIndex = randomValues[i] % ALL_CHARS.length; + password += ALL_CHARS[randomIndex]; + } + + return password; +} + +function generateSecurePassword(length: number): string { + if (length < 4) { + throw new Error('The minimum length for a secure password is 4 characters'); + } + + const charSets = [UPPERCASE_LETTERS, LOWERCASE_LETTERS, NUMBERS, SPECIAL_CHARACTERS]; + + let password = ''; + password += UPPERCASE_LETTERS[Math.floor(Math.random() * UPPERCASE_LETTERS.length)]; + password += LOWERCASE_LETTERS[Math.floor(Math.random() * LOWERCASE_LETTERS.length)]; + password += NUMBERS[Math.floor(Math.random() * NUMBERS.length)]; + password += SPECIAL_CHARACTERS[Math.floor(Math.random() * SPECIAL_CHARACTERS.length)]; + + const ALL_CHARS = charSets.join(''); + const remainingLength = length - 4; + + if 
(remainingLength > 0) { + const randomValues = new Uint32Array(remainingLength); + crypto.getRandomValues(randomValues); + + for (let i = 0; i < remainingLength; i++) { + const randomIndex = randomValues[i] % ALL_CHARS.length; + password += ALL_CHARS[randomIndex]; + } + } + + return password + .split('') + .sort(() => Math.random() - 0.5) + .join(''); +} + +export { + generatePassword, + generateSimplePassword, + generateSecurePassword, + UPPERCASE_LETTERS, + LOWERCASE_LETTERS, + NUMBERS, + SPECIAL_CHARACTERS, +}; + +export type { PasswordOptions }; diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json index 56c5e2efce..0b6bbc08c9 100644 --- a/frontend/src/locale/en.json +++ b/frontend/src/locale/en.json @@ -1,774 +1,787 @@ { - "dstack": "Dstack", - "common": { - "ok": "OK", - "loading": "Loading", - "add": "Add", - "yes": "Yes", - "no": "No", - "create": "Create", - "create_wit_text": "Create {{text}}", - "edit": "Edit", - "delete": "Delete", - "remove": "Remove", - "apply": "Apply", - "next": "Next", - "previous": "Back", - "settings": "Settings", - "match_count_with_value_one": "{{count}} match", - "match_count_with_value_other": "{{count}} matches", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "sign_out": "Sign out", - "cancel": "Cancel", - "save": "Save", - "send" : "Send", - "profile": "Profile", - "copied": "Copied", - "copy": "Copy", - "info": "Info", - "stop": "Stop", - "abort": "Abort", - "close": "Close", - "clearFilter": "Clear filter", - "server_error": "Server error: {{error}}", - "login": "Sign in", - "login_github": "Sign in with GitHub", - "login_okta": "Sign in with Okta", - "login_entra": "Sign in with EntraID", - "login_google": "Sign in with Google", - "general": "General", - "test": "Test", - "local_storage_unavailable": "Local Storage is unavailable", - "local_storage_unavailable_message": "Your browser doesn't support local storage", - "object": "Object", - "objects_other": 
"Objects", - "continue": "Continue", - "select_visible_columns": "Select visible columns", - "tutorial": "Tutorials", - "tutorial_other": "Take a tour", - "docs": "Docs", - "discord": "Discord", - "danger_zone": "Danger Zone", - "control_plane": "Control plane", - "refresh": "Refresh", - "quickstart": "Quickstart", - "ask_ai": "Ask AI", - "new": "New", - "full_view": "Full view" - }, - - "auth": { - "invalid_token": "Invalid token", - "you_are_not_logged_in": "You are not logged in", - "contact_to_administrator": "For getting the authorization token, contact to the administrator", - "sign_in_to_dstack": "Welcome to dstack Sky", - "sign_in_to_dstack_enterprise": "Welcome to dstack", - "authorization_failed": "Authorization is failed", - "try_again": "Please try again", - "login_by_token": "Sign in via a token", - "another_login_methods": "Other sign in options" - }, - - "navigation": { - "settings": "Settings", - "runs": "Runs", - "models": "Models", - "fleets": "Fleets", - "fleet": "Fleet", - "project": "project", - "project_other": "Projects", - "general": "General", - "users": "Users", - "user_settings": "User settings", - "account": "User", - "billing": "Billing", - "resources": "Resources", - "volumes": "Volumes", - "instances": "Instances", - "offers": "Offers", - "events": "Events" - }, - - "backend": { - "page_title_one": "Backend", - "page_title_other": "Backends", - "add_backend": "Add backend", - "edit_backend": "Edit backend", - "empty_message_title": "No backends", - "empty_message_text": "No backends to display.", - "type": { - "aws": "AWS", - "aws_description": "Run workflows and store data in Amazon Web Services ", - "gcp": "GCP", - "gcp_description": "Run workflows and store data in Google Cloud Platform", - "azure": "Azure", - "azure_description": "Run workflows and store data in Microsoft Azure", - "lambda": "Lambda", - "lambda_description": "Run workflows and store data in Lambda", - "local": "Local", - "local_description": "Run workflows and 
store data locally via Docker" + "dstack": "Dstack", + "common": { + "ok": "OK", + "loading": "Loading", + "add": "Add", + "yes": "Yes", + "no": "No", + "create": "Create", + "create_wit_text": "Create {{text}}", + "edit": "Edit", + "delete": "Delete", + "remove": "Remove", + "apply": "Apply", + "next": "Next", + "previous": "Back", + "settings": "Settings", + "match_count_with_value_one": "{{count}} match", + "match_count_with_value_other": "{{count}} matches", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "sign_out": "Sign out", + "cancel": "Cancel", + "save": "Save", + "send": "Send", + "profile": "Profile", + "copied": "Copied", + "copy": "Copy", + "info": "Info", + "stop": "Stop", + "abort": "Abort", + "close": "Close", + "clearFilter": "Clear filter", + "server_error": "Server error: {{error}}", + "login": "Sign in", + "login_github": "Sign in with GitHub", + "login_okta": "Sign in with Okta", + "login_entra": "Sign in with EntraID", + "login_google": "Sign in with Google", + "general": "General", + "test": "Test", + "local_storage_unavailable": "Local Storage is unavailable", + "local_storage_unavailable_message": "Your browser doesn't support local storage", + "object": "Object", + "objects_other": "Objects", + "continue": "Continue", + "select_visible_columns": "Select visible columns", + "tutorial": "Tutorials", + "tutorial_other": "Take a tour", + "docs": "Docs", + "discord": "Discord", + "danger_zone": "Danger Zone", + "control_plane": "Control plane", + "refresh": "Refresh", + "quickstart": "Quickstart", + "ask_ai": "Ask AI", + "new": "New", + "full_view": "Full view" }, - "table": { - "region": "Region", - "bucket": "Storage" + "auth": { + "invalid_token": "Invalid token", + "you_are_not_logged_in": "You are not logged in", + "contact_to_administrator": "For getting the authorization token, contact to the administrator", + "sign_in_to_dstack": "Welcome to dstack Sky", + "sign_in_to_dstack_enterprise": 
"Welcome to dstack", + "authorization_failed": "Authorization is failed", + "try_again": "Please try again", + "login_by_token": "Sign in via a token", + "another_login_methods": "Other sign in options" }, - "edit": { - "success_notification": "Project updating is successful", - "delete_backend_confirm_title": "Delete backend", - "delete_backend_confirm_message": "Are you sure you want to delete this backend?", - "delete_backends_confirm_title": "Delete backends", - "delete_backends_confirm_message": "Are you sure you want to delete these backends?" + "navigation": { + "settings": "Settings", + "runs": "Runs", + "models": "Models", + "fleets": "Fleets", + "fleet": "Fleet", + "project": "project", + "project_other": "Projects", + "general": "General", + "users": "Users", + "user_settings": "User settings", + "account": "User", + "billing": "Billing", + "resources": "Resources", + "volumes": "Volumes", + "instances": "Instances", + "offers": "Offers", + "events": "Events" }, - "create": { - "success_notification": "Backend is created" - } - }, + "backend": { + "page_title_one": "Backend", + "page_title_other": "Backends", + "add_backend": "Add backend", + "edit_backend": "Edit backend", + "empty_message_title": "No backends", + "empty_message_text": "No backends to display.", + "type": { + "aws": "AWS", + "aws_description": "Run workflows and store data in Amazon Web Services ", + "gcp": "GCP", + "gcp_description": "Run workflows and store data in Google Cloud Platform", + "azure": "Azure", + "azure_description": "Run workflows and store data in Microsoft Azure", + "lambda": "Lambda", + "lambda_description": "Run workflows and store data in Lambda", + "local": "Local", + "local_description": "Run workflows and store data locally via Docker" + }, - "gateway": { - "page_title_one": "Gateway", - "page_title_other": "Gateways", - "add_gateway": "Add gateway", - "edit_gateway": "Edit gateway", - "empty_message_title": "No gateways", - "empty_message_text": "No gateways to 
display.", + "table": { + "region": "Region", + "bucket": "Storage" + }, - "edit": { - "backend": "Backend", - "backend_description": "Select a backend", - "region": "Region", - "region_description": "Select a region", - "default": "Default", - "default_checkbox": "Turn on default", - "external_ip": "External IP", - "wildcard_domain": "Wildcard domain", - "wildcard_domain_description": "Specify the wildcard domain mapped to the external IP.", - "wildcard_domain_placeholder": "*.mydomain.com", - "delete_gateway_confirm_title": "Delete gateway", - "delete_gateway_confirm_message": "Are you sure you want to delete this gateway?", - "delete_gateways_confirm_title": "Delete gateways", - "delete_gateways_confirm_message": "Are you sure you want to delete these gateways?", + "edit": { + "success_notification": "Project updating is successful", + "delete_backend_confirm_title": "Delete backend", + "delete_backend_confirm_message": "Are you sure you want to delete this backend?", + "delete_backends_confirm_title": "Delete backends", + "delete_backends_confirm_message": "Are you sure you want to delete these backends?" + }, - "validation": { - "wildcard_domain_format": "Should use next format: {{pattern}}" - } + "create": { + "success_notification": "Backend is created" + } }, - "create": { - "success_notification": "Gateway is created", - "creating_notification": "The gateway is creating. 
It may take some time" - }, + "gateway": { + "page_title_one": "Gateway", + "page_title_other": "Gateways", + "add_gateway": "Add gateway", + "edit_gateway": "Edit gateway", + "empty_message_title": "No gateways", + "empty_message_text": "No gateways to display.", - "update": { - "success_notification": "Gateway is updated" - }, + "edit": { + "backend": "Backend", + "backend_description": "Select a backend", + "region": "Region", + "region_description": "Select a region", + "default": "Default", + "default_checkbox": "Turn on default", + "external_ip": "External IP", + "wildcard_domain": "Wildcard domain", + "wildcard_domain_description": "Specify the wildcard domain mapped to the external IP.", + "wildcard_domain_placeholder": "*.mydomain.com", + "delete_gateway_confirm_title": "Delete gateway", + "delete_gateway_confirm_message": "Are you sure you want to delete this gateway?", + "delete_gateways_confirm_title": "Delete gateways", + "delete_gateways_confirm_message": "Are you sure you want to delete these gateways?", - "test_domain": { - "success_notification": "Domain is valid" - } - }, + "validation": { + "wildcard_domain_format": "Should use next format: {{pattern}}" + } + }, - "projects": { - "page_title": "Projects", - "search_placeholder": "Find projects", - "empty_message_title": "No projects", - "empty_message_text": "No projects to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "nomatch_message_button_label": "Clear filter", - "repositories": "Repositories", - "runs": "Runs", - "tags": "Tags", - "events": "Events", - "settings": "Settings", - "join": "Join", - "leave_confirm_title": "Leave project", - "leave_confirm_message": "Are you sure you want to leave this project?", - "leave": "Leave", - "join_success": "Successfully joined the project", - "leave_success": "Successfully left the project", - "join_error": "Failed to join project", - "leave_error": "Failed to leave project", - "card": { - 
"backend": "Backend", - "settings": "Settings" - }, - "wizard": { - "submit": "Create" - }, - "edit": { - "general": "General", - "project_name": "Name", - "owner": "Owner", - "project_name_description": "Only latin characters, dashes, underscores, and digits", - "project_type": "Project type", - "project_type_description": "Choose which project type you want to create", - "backends": "Backends", - "base_backends_description": "dstack will automatically collect offers from the following providers. Deselect providers you don’t want to use.", - "backends_description": "The following backends can be configured with your own cloud credentials in the project settings after the project is created.", - "create_default_fleet": "Create a default fleet", - "default_fleet": "Default fleet", - "default_fleet_description": "At least one fleet is required to run dev environments, tasks, or services.", - "is_public": "Public", - "is_public_description": "Allow any user join the project as a member", - "backend": "Backend", - "backend_config": "Backend config", - "backend_config_description": "Specify the backend config in the YAML format. Click Info for examples.", - "backend_type": "Type", - "backend_type_description": "Select a backend type", - "members_empty_message_title": "No members", - "members_empty_message_text": "Select project's members", - "update_members_success": "Members are updated", - "update_visibility_success": "Project visibility updated successfully", - "update_visibility_confirm_title": "Change project visibility", - "update_visibility_confirm_message": "Are you sure you want to change the project visibility? 
This will affect who can access this project.", - "change_visibility": "Change", - "project_visibility": "Visibility", - "project_visibility_description": "Control who can access this project", - "make_project_public": "Make project public", - "delete_project_confirm_title": "Delete project", - "delete_project_confirm_message": "Are you sure you want to delete this project?", - "delete_projects_confirm_title": "Delete projects", - "delete_projects_confirm_message": "Are you sure you want to delete these projects?", - "delete_this_project": "Delete this project", - "cli": "CLI", - "aws": { - "authorization": "Authorization", - "authorization_default": "Default credentials", - "authorization_access_key": "Access key", - "access_key": "Access key", - "access_key_id": "Access key ID", - "access_key_id_description": "Specify the AWS access key ID", - "secret_key": "Secret key", - "secret_key_id": "Secret access key", - "secret_key_id_description": "Specify the AWS secret access key", - "regions": "Regions", - "regions_description": "Select regions to run workflows and store artifacts", - "regions_placeholder": "Select regions", - "s3_bucket_name": "Bucket", - "s3_bucket_name_description": "Select an S3 bucket to store artifacts", - "ec2_subnet_id": "Subnet", - "ec2_subnet_id_description": "Select a subnet to run workflows in", - "ec2_subnet_id_placeholder": "Not selected", - "vpc_name": "VPC", - "vpc_name_description": "Enter a vpc" - }, - "azure" : { - "authorization": "Authorization", - "authorization_default": "Default credentials", - "authorization_client": "Client secret", - "tenant_id": "Tenant ID", - "tenant_id_description": "Specify an Azure tenant ID", - "tenant_id_placeholder": "Not selected", - "client_id": "Client ID", - "client_id_description": "Specify an Azure client (application) ID", - "client_secret": "Client secret", - "client_secret_description": "Specify an Azure client (application) secret", - "subscription_id": "Subscription ID", - 
"subscription_id_description": "Select an Azure subscription ID", - "subscription_id_placeholder": "Not selected", - "locations": "Locations", - "locations_description": "Select locations to run workflows", - "locations_placeholder": "Select locations", - "storage_account": "Storage account", - "storage_account_description": "Select an Azure storage account to store artifacts", - "storage_account_placeholder": "Not selected" + "create": { + "success_notification": "Gateway is created", + "creating_notification": "The gateway is creating. It may take some time" + }, - }, - "gcp": { - "authorization": "Authorization", - "authorization_default": "Default credentials", - "service_account": "Service account key", - "credentials_description": "Credentials description", - "credentials_placeholder": "Credentials placeholder", - "regions": "Regions", - "regions_description": "Select regions to run workflows and store artifacts", - "regions_placeholder": "Select regions", - "project_id": "Project Id", - "project_id_description": "Select a project id", - "project_id_placeholder": "Select a project Id" - }, - "lambda": { - "api_key": "API key", - "api_key_description": "Specify the Lambda API key", - "regions": "Regions", - "regions_description": "Select regions to run workflows", - "regions_placeholder": "Select regions", - "storage_backend": { - "type": "Storage", - "type_description": "Select backend storage", - "type_placeholder": "Select type", - "credentials": { - "access_key_id": "Access key ID", - "access_key_id_description": "Specify the AWS access key ID", - "secret_key_id": "Secret access key", - "secret_key_id_description": "Specify the AWS secret access key" - }, - "s3_bucket_name": "Bucket", - "s3_bucket_name_description": "Select an S3 bucket to store artifacts" - } - }, - "local": { - "path": "Files path" - }, - "members": { - "section_title": "Members", - "name": "User name", - "role": "Project role" - }, - "secrets": { - "section_title": "Secrets", - 
"empty_message_title": "No secrets", - "empty_message_text": "No secrets to display.", - "name": "Secret name", - "value": "Secret value", - "create_secret": "Create secret", - "update_secret": "Update secret", - "delete_confirm_title": "Delete secret", - "delete_confirm_message": "Are you sure you want to delete the {{name}} secret?", - "multiple_delete_confirm_title": "Delete secrets", - "multiple_delete_confirm_message": "Are you sure you want to delete {{count}} secrets?", - "not_permissions_title": "No permissions", - "not_permissions_description": "You don't have permissions for managing secrets", - "validation": { - "secret_name_format": "Invalid secret name" + "update": { + "success_notification": "Gateway is updated" + }, + + "test_domain": { + "success_notification": "Domain is valid" } - }, - "error_notification": "Update project error", - "validation": { - "user_name_format": "Only letters, numbers, - or _" - }, - "visibility": { - "private": "Private", - "public": "Public" - } - }, - "create": { - "page_title": "Create project", - "error_notification": "Create project error", - "success_notification": "Project is created" }, - "repo": { - "search_placeholder": "Find repositories", - "empty_message_title": "No repositories", - "empty_message_text": "No repositories to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "card": { - "owner": "Owner", - "last_run": "Last run", - "tags_count": "Tags count", - "directory": "Directory" - }, - "secrets": { - "table_title": "Secrets", - "add_modal_title": "Add secret", - "update_modal_title": "Update secret", - "name": "Secret name", - "name_description": "Secret name", - "value": "Secret value", - "value_description": "Secret value", - "search_placeholder": "Find secrets", - "empty_message_title": "No secrets", - "empty_message_text": "No secrets to display." 
- } + + "projects": { + "page_title": "Projects", + "search_placeholder": "Find projects", + "empty_message_title": "No projects", + "empty_message_text": "No projects to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "nomatch_message_button_label": "Clear filter", + "repositories": "Repositories", + "runs": "Runs", + "tags": "Tags", + "events": "Events", + "settings": "Settings", + "join": "Join", + "leave_confirm_title": "Leave project", + "leave_confirm_message": "Are you sure you want to leave this project?", + "leave": "Leave", + "join_success": "Successfully joined the project", + "leave_success": "Successfully left the project", + "join_error": "Failed to join project", + "leave_error": "Failed to leave project", + "card": { + "backend": "Backend", + "settings": "Settings" + }, + "wizard": { + "submit": "Create" + }, + "edit": { + "general": "General", + "project_name": "Name", + "owner": "Owner", + "project_name_description": "Only latin characters, dashes, underscores, and digits", + "project_type": "Project type", + "project_type_description": "Choose which project type you want to create", + "backends": "Backends", + "base_backends_description": "dstack will automatically collect offers from the following providers. Deselect providers you don’t want to use.", + "backends_description": "The following backends can be configured with your own cloud credentials in the project settings after the project is created.", + "create_default_fleet": "Create a default fleet", + "default_fleet": "Default fleet", + "default_fleet_description": "At least one fleet is required to run dev environments, tasks, or services.", + "is_public": "Public", + "is_public_description": "Allow any user join the project as a member", + "backend": "Backend", + "backend_config": "Backend config", + "backend_config_description": "Specify the backend config in the YAML format. 
Click Info for examples.", + "backend_type": "Type", + "backend_type_description": "Select a backend type", + "members_empty_message_title": "No members", + "members_empty_message_text": "Select project's members", + "update_members_success": "Members are updated", + "update_visibility_success": "Project visibility updated successfully", + "update_visibility_confirm_title": "Change project visibility", + "update_visibility_confirm_message": "Are you sure you want to change the project visibility? This will affect who can access this project.", + "change_visibility": "Change", + "project_visibility": "Visibility", + "project_visibility_description": "Control who can access this project", + "make_project_public": "Make project public", + "delete_project_confirm_title": "Delete project", + "delete_project_confirm_message": "Are you sure you want to delete this project?", + "delete_projects_confirm_title": "Delete projects", + "delete_projects_confirm_message": "Are you sure you want to delete these projects?", + "delete_this_project": "Delete this project", + "cli": "CLI", + "aws": { + "authorization": "Authorization", + "authorization_default": "Default credentials", + "authorization_access_key": "Access key", + "access_key": "Access key", + "access_key_id": "Access key ID", + "access_key_id_description": "Specify the AWS access key ID", + "secret_key": "Secret key", + "secret_key_id": "Secret access key", + "secret_key_id_description": "Specify the AWS secret access key", + "regions": "Regions", + "regions_description": "Select regions to run workflows and store artifacts", + "regions_placeholder": "Select regions", + "s3_bucket_name": "Bucket", + "s3_bucket_name_description": "Select an S3 bucket to store artifacts", + "ec2_subnet_id": "Subnet", + "ec2_subnet_id_description": "Select a subnet to run workflows in", + "ec2_subnet_id_placeholder": "Not selected", + "vpc_name": "VPC", + "vpc_name_description": "Enter a vpc" + }, + "azure": { + "authorization": 
"Authorization", + "authorization_default": "Default credentials", + "authorization_client": "Client secret", + "tenant_id": "Tenant ID", + "tenant_id_description": "Specify an Azure tenant ID", + "tenant_id_placeholder": "Not selected", + "client_id": "Client ID", + "client_id_description": "Specify an Azure client (application) ID", + "client_secret": "Client secret", + "client_secret_description": "Specify an Azure client (application) secret", + "subscription_id": "Subscription ID", + "subscription_id_description": "Select an Azure subscription ID", + "subscription_id_placeholder": "Not selected", + "locations": "Locations", + "locations_description": "Select locations to run workflows", + "locations_placeholder": "Select locations", + "storage_account": "Storage account", + "storage_account_description": "Select an Azure storage account to store artifacts", + "storage_account_placeholder": "Not selected" + }, + "gcp": { + "authorization": "Authorization", + "authorization_default": "Default credentials", + "service_account": "Service account key", + "credentials_description": "Credentials description", + "credentials_placeholder": "Credentials placeholder", + "regions": "Regions", + "regions_description": "Select regions to run workflows and store artifacts", + "regions_placeholder": "Select regions", + "project_id": "Project Id", + "project_id_description": "Select a project id", + "project_id_placeholder": "Select a project Id" + }, + "lambda": { + "api_key": "API key", + "api_key_description": "Specify the Lambda API key", + "regions": "Regions", + "regions_description": "Select regions to run workflows", + "regions_placeholder": "Select regions", + "storage_backend": { + "type": "Storage", + "type_description": "Select backend storage", + "type_placeholder": "Select type", + "credentials": { + "access_key_id": "Access key ID", + "access_key_id_description": "Specify the AWS access key ID", + "secret_key_id": "Secret access key", + 
"secret_key_id_description": "Specify the AWS secret access key" + }, + "s3_bucket_name": "Bucket", + "s3_bucket_name_description": "Select an S3 bucket to store artifacts" + } + }, + "local": { + "path": "Files path" + }, + "members": { + "section_title": "Members", + "name": "User name", + "role": "Project role" + }, + "secrets": { + "section_title": "Secrets", + "empty_message_title": "No secrets", + "empty_message_text": "No secrets to display.", + "name": "Secret name", + "value": "Secret value", + "create_secret": "Create secret", + "update_secret": "Update secret", + "delete_confirm_title": "Delete secret", + "delete_confirm_message": "Are you sure you want to delete the {{name}} secret?", + "multiple_delete_confirm_title": "Delete secrets", + "multiple_delete_confirm_message": "Are you sure you want to delete {{count}} secrets?", + "not_permissions_title": "No permissions", + "not_permissions_description": "You don't have permissions for managing secrets", + "validation": { + "secret_name_format": "Invalid secret name" + } + }, + "error_notification": "Update project error", + "validation": { + "user_name_format": "Only letters, numbers, - or _" + }, + "visibility": { + "private": "Private", + "public": "Public" + } + }, + "create": { + "page_title": "Create project", + "error_notification": "Create project error", + "success_notification": "Project is created" + }, + "repo": { + "search_placeholder": "Find repositories", + "empty_message_title": "No repositories", + "empty_message_text": "No repositories to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "card": { + "owner": "Owner", + "last_run": "Last run", + "tags_count": "Tags count", + "directory": "Directory" + }, + "secrets": { + "table_title": "Secrets", + "add_modal_title": "Add secret", + "update_modal_title": "Update secret", + "name": "Secret name", + "name_description": "Secret name", + "value": "Secret value", + "value_description": 
"Secret value", + "search_placeholder": "Find secrets", + "empty_message_title": "No secrets", + "empty_message_text": "No secrets to display." + } + }, + "run": { + "list_page_title": "Runs", + "search_placeholder": "Find runs", + "empty_message_title": "No runs", + "empty_message_text": "No runs to display.", + "quickstart_message_text": "Check out the quickstart guide to get started with dstack", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match. Try to change project or clear filter", + "filter_property_placeholder": "Filter by properties", + "project": "Project", + "project_placeholder": "Filtering by project", + "repo": "Repository", + "repo_placeholder": "Filtering by repository", + "user": "User", + "user_placeholder": "Filtering by user", + "active_only": "Active runs", + "log": "Logs", + "log_empty_message_title": "No logs", + "log_empty_message_text": "No logs to display.", + "inspect": "Inspect", + "run_name": "Name", + "workflow_name": "Workflow", + "configuration": "Configuration", + "instance": "Instance", + "priority": "Priority", + "provider_name": "Provider", + "status": "Status", + "probe": "Probes", + "submitted_at": "Submitted", + "finished_at": "Finished", + "metrics": { + "title": "Metrics", + "show_metrics": "Show metrics", + "cpu_utilization": "CPU utilization %", + "memory_used": "System memory used", + "per_each_cpu_utilization": "GPU utilization %", + "per_each_memory_used": "GPU memory used" + }, + "jobs": "Jobs", + "job_name": "Job Name", + "cost": "Cost", + "backend": "Backend", + "region": "Region", + "instance_id": "Instance ID", + "schedule": "Schedule", + "next_run": "Next run", + "resources": "Resources", + "spot": "Spot", + "termination_reason": "Termination reason", + "price": "Price", + "error": "Error", + "artifacts": "Artifacts", + "artifacts_count": "Artifacts", + "hub_user_name": "User", + "service_url": "Service URL", + "statuses": { + "pending": "Pending", + "submitted": 
"Submitted", + "provisioning": "Provisioning", + "pulling": "Pulling", + "downloading": "Downloading", + "running": "Running", + "uploading": "Uploading", + "stopping": "Stopping", + "stopped": "Stopped", + "terminating": "Terminating", + "terminated": "Terminated", + "aborting": "Aborting", + "aborted": "Aborted", + "failed": "Failed", + "done": "Done", + "building": "Building" + } + }, + "tag": { + "list_page_title": "Artifacts", + "search_placeholder": "Find tags", + "empty_message_title": "No tags", + "empty_message_text": "No tags to display.", + "tag_name": "Tag", + "run_name": "Run", + "artifacts": "Files" + }, + "artifact": { + "list_page_title": "Artifacts", + "search_placeholder": "Find objects", + "empty_message_title": "No objects", + "empty_message_text": "No objects to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "name": "Name", + "type": "Type", + "size": "Size" + } }, - "run": { - "list_page_title": "Runs", - "search_placeholder": "Find runs", - "empty_message_title": "No runs", - "empty_message_text": "No runs to display.", - "quickstart_message_text": "Check out the quickstart guide to get started with dstack", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match. 
Try to change project or clear filter", - "filter_property_placeholder": "Filter by properties", - "project": "Project", - "project_placeholder": "Filtering by project", - "repo": "Repository", - "repo_placeholder": "Filtering by repository", - "user": "User", - "user_placeholder": "Filtering by user", - "active_only": "Active runs", - "log": "Logs", - "log_empty_message_title": "No logs", - "log_empty_message_text": "No logs to display.", - "inspect": "Inspect", - "run_name": "Name", - "workflow_name": "Workflow", - "configuration": "Configuration", - "instance": "Instance", - "priority": "Priority", - "provider_name": "Provider", - "status": "Status", - "probe": "Probes", - "submitted_at": "Submitted", - "finished_at": "Finished", - "metrics": { - "title": "Metrics", - "show_metrics": "Show metrics", - "cpu_utilization": "CPU utilization %", - "memory_used": "System memory used", - "per_each_cpu_utilization": "GPU utilization %", - "per_each_memory_used": "GPU memory used" - }, - "jobs": "Jobs", - "job_name": "Job Name", - "cost": "Cost", - "backend": "Backend", - "region": "Region", - "instance_id": "Instance ID", - "schedule": "Schedule", - "next_run": "Next run", - "resources": "Resources", - "spot": "Spot", - "termination_reason": "Termination reason", - "price": "Price", - "error": "Error", - "artifacts": "Artifacts", - "artifacts_count": "Artifacts", - "hub_user_name": "User", - "service_url": "Service URL", - "statuses": { - "pending": "Pending", - "submitted": "Submitted", - "provisioning": "Provisioning", - "pulling": "Pulling", - "downloading": "Downloading", - "running": "Running", - "uploading": "Uploading", - "stopping": "Stopping", - "stopped": "Stopped", - "terminating": "Terminating", - "terminated": "Terminated", - "aborting": "Aborting", - "aborted": "Aborted", - "failed": "Failed", - "done": "Done", - "building": "Building" - } + "runs": { + "launch_button": "Launch", + "launch": { + "wizard": { + "title": "Launch", + "submit": "Apply", + 
"project": "Project", + "project_description": "Select a project", + "project_empty": "No options", + "project_loading": "Loading options", + "template": "Template", + "template_description": "Select a template", + "template_empty": "No options", + "template_loading": "Loading options", + "template_placeholder": "Select a project to select a template", + "template_card_type": "Type", + "gpu": "GPU", + "gpu_description": "Enable to select a GPU offer. Disable to run without a GPU.", + "offer": "Offer", + "offer_description": "Select an offer for the run.", + "name": "Name", + "name_description": "The name of the run, e.g. 'my-dev-env'", + "name_constraint": "Example: 'my-fleet' or 'default'. If not specified, generated automatically.", + "name_placeholder": "Optional", + "ide": "IDE", + "ide_description": "Select which IDE would you like to use with the dev environment.", + "docker": "Docker", + "docker_image": "Image", + "docker_image_description": "A Docker image name, e.g. 'lmsysorg/sglang:latest'", + "docker_image_constraint": "The image must be public", + "docker_image_placeholder": "Required", + "python": "Python", + "python_description": "The version of Python, e.g. '3.12'", + "python_placeholder": "Optional", + "repo": "Repo", + "working_dir": "Working dir", + "working_dir_description": "The absolute path to the working directory inside the container, e.g. '/home/user/project'", + "working_dir_placeholder": "Optional", + "working_dir_constraint": "By default, set to '/workflow'", + "repo_url": "URL", + "repo_url_description": "A URL of a Git repository, e.g. 'https://github.com/user/repo'", + "repo_url_constraint": "The repo must be public", + "repo_url_placeholder": "Required", + "repo_path": "Path", + "repo_path_description": "The path inside the container to clone the repository, e.g. 
'/home/user/project'", + "repo_path_placeholder": "Optional", + "repo_path_constraint": "By default, set to '/workflow'", + "config": "Configuration file", + "configuration_label": "Configuration", + "configuration_description": "Review and adjust the configuration if needed.", + "success_notification": "The run is submitted!" + } + } }, - "tag": { - "list_page_title": "Artifacts", - "search_placeholder": "Find tags", - "empty_message_title": "No tags", - "empty_message_text": "No tags to display.", - "tag_name": "Tag", - "run_name": "Run", - "artifacts": "Files" + "offer": { + "title": "Offers", + "filter_property_placeholder": "Filter by properties", + "backend": "Backend", + "backend_plural": "Backends", + "availability": "Availability", + "groupBy": "Group by properties", + "region": "Region", + "count": "Count", + "price": "$/GPU", + "memory_mib": "Memory", + "spot": "Spot policy", + "empty_message_title_select_project": "Select a project", + "empty_message_text_select_project": "Use the filter above to select a project", + "empty_message_title_select_groupBy": "Select a group by", + "empty_message_text_select_groupBy": "Use the field above to select a group by", + "empty_message_title": "No offers", + "empty_message_text": "No offers to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match." }, - "artifact": { - "list_page_title": "Artifacts", - "search_placeholder": "Find objects", - "empty_message_title": "No objects", - "empty_message_text": "No objects to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "name": "Name", - "type": "Type", - "size": "Size" - } - }, - "runs": { - "dev_env": { - "wizard": { - "title": "New dev environment", - "submit": "Apply", - "offer": "Offer", - "offer_description": "Select an offer for the dev environment.", - "name": "Name", - "name_description": "The name of the run, e.g. 
'my-dev-env'", - "name_constraint": "Example: 'my-fleet' or 'default'. If not specified, generated automatically.", - "name_placeholder": "Optional", - "ide": "IDE", - "ide_description": "Select which IDE would you like to use with the dev environment.", - "docker": "Docker", - "docker_image": "Image", - "docker_image_description": "A Docker image name, e.g. 'lmsysorg/sglang:latest'", - "docker_image_constraint": "The image must be public", - "docker_image_placeholder": "Required", - "python": "Python", - "python_description": "The version of Python, e.g. '3.12'", - "python_placeholder": "Optional", - "repo": "Repo", - "working_dir": "Working dir", - "working_dir_description": "The absolute path to the working directory inside the container, e.g. '/home/user/project'", - "working_dir_placeholder": "Optional", - "working_dir_constraint": "By default, set to '/workflow'", - "repo_url": "URL", - "repo_url_description": "A URL of a Git repository, e.g. 'https://github.com/user/repo'", - "repo_url_constraint": "The repo must be public", - "repo_url_placeholder": "Required", - "repo_path": "Path", - "repo_path_description": "The path inside the container to clone the repository, e.g. '/home/user/project'", - "repo_path_placeholder": "Optional", - "repo_path_constraint": "By default, set to '/workflow'", - "config": "Configuration file", - "config_description": "Review the configuration file and adjust it if needed. Click Info for examples.", - "success_notification": "The run is submitted!" 
- } - } - }, - "offer": { - "title": "Offers", - "filter_property_placeholder": "Filter by properties", - "backend": "Backend", - "backend_plural": "Backends", - "availability": "Availability", - "groupBy": "Group by properties", - "region": "Region", - "count": "Count", - "price": "$/GPU", - "memory_mib": "Memory", - "spot": "Spot policy", - "empty_message_title_select_project": "Select a project", - "empty_message_text_select_project": "Use the filter above to select a project", - "empty_message_title_select_groupBy": "Select a group by", - "empty_message_text_select_groupBy": "Use the field above to select a group by", - "empty_message_title": "No offers", - "empty_message_text": "No offers to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match." - }, - "models": { - "model_name": "Name", - "url": "URL", - "gateway": "Gateway", - "type": "Type", - "run": "Run", - "resources": "Resources", - "price": "Price", - "submitted_at": "Submitted", - "user": "User", - "repository": "Repository", - "backend": "Backend", - "code": "Code", - "empty_message_title": "No models", - "empty_message_text": "No models to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "nomatch_message_button_label": "Clear filter", + "models": { + "model_name": "Name", + "url": "URL", + "gateway": "Gateway", + "type": "Type", + "run": "Run", + "resources": "Resources", + "price": "Price", + "submitted_at": "Submitted", + "user": "User", + "repository": "Repository", + "backend": "Backend", + "code": "Code", + "empty_message_title": "No models", + "empty_message_text": "No models to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "nomatch_message_button_label": "Clear filter", - "details": { - "instructions": "System", - "instructions_description": "Specify system", - "message_placeholder": "Enter your question", - "chat_empty_title": 
"No messages yet", - "chat_empty_message": "Please start a chat", - "run_name": "Run name", - "view_code": "View code", - "view_code_description": "You can use the following code to start integrating your current prompt and settings into your application." - } - }, - - "fleets": { - "no_alert": { - "title": "No fleets", - "description": "The project has no fleets. Create one before submitting a run.", - "button_title": "Create a fleet" - }, - "fleet": "Fleet", - "fleet_placeholder": "Filtering by fleet", - "fleet_name": "Fleet name", - "total_instances": "Number of instances", - "inspect": "Inspect", - "empty_message_title": "No fleets", - "empty_message_text": "No fleets to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "nomatch_message_button_label": "Clear filter", - "active_only": "Active fleets", - "filter_property_placeholder": "Filter by properties", - "statuses": { - "active": "Active", - "submitted": "Submitted", - "failed": "Failed", - "terminating": "Terminating", - "terminated": "Terminated" + "details": { + "instructions": "System", + "instructions_description": "Specify system", + "message_placeholder": "Enter your question", + "chat_empty_title": "No messages yet", + "chat_empty_message": "Please start a chat", + "run_name": "Run name", + "view_code": "View code", + "view_code_description": "You can use the following code to start integrating your current prompt and settings into your application." + } }, - "create": { - "success_notification": "The fleet is created!" + + "fleets": { + "no_alert": { + "title": "No fleets", + "description": "The project has no fleets. 
Create one before submitting a run.", + "button_title": "Create a fleet" + }, + "fleet": "Fleet", + "fleet_placeholder": "Filtering by fleet", + "fleet_name": "Fleet name", + "total_instances": "Number of instances", + "inspect": "Inspect", + "empty_message_title": "No fleets", + "empty_message_text": "No fleets to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "nomatch_message_button_label": "Clear filter", + "active_only": "Active fleets", + "filter_property_placeholder": "Filter by properties", + "statuses": { + "active": "Active", + "submitted": "Submitted", + "failed": "Failed", + "terminating": "Terminating", + "terminated": "Terminated" + }, + "create": { + "success_notification": "The fleet is created!" + }, + "instances": { + "active_only": "Active instances", + "filter_property_placeholder": "Filter by properties", + "title": "Instances", + "empty_message_title": "No instances", + "empty_message_text": "No instances to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "instance_name": "Instance", + "instance_num": "Instance num", + "created": "Created", + "status": "Status", + "project": "Project", + "hostname": "Host name", + "instance_type": "Type", + "statuses": { + "pending": "Pending", + "provisioning": "Provisioning", + "idle": "Idle", + "busy": "Busy", + "terminating": "Terminating", + "terminated": "Terminated" + }, + "resources": "Resources", + "backend": "Backend", + "region": "Region", + "spot": "Spot", + "started": "Started", + "price": "Price" + }, + "edit": { + "name": "Name", + "name_description": "The name of the fleet, e.g. 'my-fleet'", + "name_placeholder": "Optional", + "name_constraint": "Example: 'my-fleet' or 'default'. 
If not specified, generated automatically.", + "min_instances": "Min number of instances", + "min_instances_description": "Set it '0' to provision instances only when required", + "max_instances": "Max number of instances", + "max_instances_description": "Required only if you want to set an upper limit", + "max_instances_placeholder": "Optional", + "idle_duration": "Idle duration", + "idle_duration_description": "Example: '0s', '1m', '1h'", + "spot_policy": "Spot policy", + "spot_policy_description": "Set it to 'auto' to allow the use of both on-demand and spot instances" + } }, - "instances": { - "active_only": "Active instances", - "filter_property_placeholder": "Filter by properties", - "title": "Instances", - "empty_message_title": "No instances", - "empty_message_text": "No instances to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "instance_name": "Instance", - "instance_num": "Instance num", - "created": "Created", - "status": "Status", - "project": "Project", - "hostname": "Host name", - "instance_type": "Type", - "statuses": { - "pending": "Pending", - "provisioning": "Provisioning", - "idle": "Idle", - "busy": "Busy", - "terminating": "Terminating", - "terminated": "Terminated" - }, - "resources": "Resources", - "backend": "Backend", - "region": "Region", - "spot": "Spot", - "started": "Started", - "price": "Price" + "volume": { + "volumes": "Volumes", + "empty_message_title": "No volumes", + "empty_message_text": "No volumes to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "delete_volumes_confirm_title": "Delete volumes", + "delete_volumes_confirm_message": "Are you sure you want to delete these volumes?", + "active_only": "Active volumes", + "filter_property_placeholder": "Filter by properties", + + "name": "Name", + "project": "Project name", + "region": "Region", + "backend": "Backend", + "status": "Status", + "created": "Created", + 
"finished": "Finished", + "price": "Price (per month)", + "cost": "Cost", + "statuses": { + "failed": "Failed", + "submitted": "Submitted", + "provisioning": "Provisioning", + "active": "Active", + "deleted": "Deleted" + } }, - "edit": { - "name": "Name", - "name_description": "The name of the fleet, e.g. 'my-fleet'", - "name_placeholder": "Optional", - "name_constraint": "Example: 'my-fleet' or 'default'. If not specified, generated automatically.", - "min_instances": "Min number of instances", - "min_instances_description": "Set it '0' to provision instances only when required", - "max_instances": "Max number of instances", - "max_instances_description": "Required only if you want to set an upper limit", - "max_instances_placeholder": "Optional", - "idle_duration": "Idle duration", - "idle_duration_description": "Example: '0s', '1m', '1h'", - "spot_policy": "Spot policy", - "spot_policy_description": "Set it to 'auto' to allow the use of both on-demand and spot instances" - } - }, - "volume": { - "volumes": "Volumes", - "empty_message_title": "No volumes", - "empty_message_text": "No volumes to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "delete_volumes_confirm_title": "Delete volumes", - "delete_volumes_confirm_message": "Are you sure you want to delete these volumes?", - "active_only": "Active volumes", - "filter_property_placeholder": "Filter by properties", - "name": "Name", - "project": "Project name", - "region": "Region", - "backend": "Backend", - "status": "Status", - "created": "Created", - "finished": "Finished", - "price": "Price (per month)", - "cost": "Cost", - "statuses": { - "failed": "Failed", - "submitted": "Submitted", - "provisioning": "Provisioning", - "active": "Active", - "deleted": "Deleted" - } - }, + "events": { + "recorded_at": "Recorded At", + "actor": "Actor", + "targets": "Targets", + "message": "Message" + }, - "events": { - "recorded_at": "Recorded At", - "actor": 
"Actor", - "targets": "Targets", - "message": "Message" - }, + "users": { + "page_title": "Users", + "search_placeholder": "Find members", + "empty_message_title": "No members", + "empty_message_text": "No members to display.", + "nomatch_message_title": "No matches", + "nomatch_message_text": "We can't find a match.", + "user_name": "User name", + "user_name_description": "Only latin characters, dashes, underscores, and digits", + "global_role_description": "Whether the user is an administrator or not", + "email_description": "Enter user email", + "token": "Token", + "token_description": "Specify use your personal access token", + "global_role": "Global role", + "active": "Active", + "active_description": "Specify user activation", + "activated": "Activated", + "deactivated": "Deactivated", + "email": "Email", + "created_at": "Created at", + "account": "User", + "account_settings": "User settings", + "settings": "Settings", + "projects": "Projects", + "events": "Events", + "create": { + "page_title": "Create user", + "error_notification": "Create user error", + "success_notification": "User is created" + }, + "edit": { + "error_notification": "Update user error", + "success_notification": "User updating is successful", + "refresh_token_success_notification": "Token rotating is successful", + "refresh_token_error_notification": "Token rotating error", + "refresh_token_confirm_title": "Rotate token", + "refresh_token_confirm_message": "Are you sure you want to rotate token?", + "refresh_token_button_label": "Rotate", + "validation": { + "user_name_format": "Only letters, numbers, - or _", + "email_format": "Incorrect email" + } + }, - "users": { - "page_title": "Users", - "search_placeholder": "Find members", - "empty_message_title": "No members", - "empty_message_text": "No members to display.", - "nomatch_message_title": "No matches", - "nomatch_message_text": "We can't find a match.", - "user_name": "User name", - "user_name_description": "Only latin characters, 
dashes, underscores, and digits", - "global_role_description": "Whether the user is an administrator or not", - "email_description": "Enter user email", - "token": "Token", - "token_description": "Specify use your personal access token", - "global_role": "Global role", - "active": "Active", - "active_description": "Specify user activation", - "activated": "Activated", - "deactivated": "Deactivated", - "email": "Email", - "created_at": "Created at", - "account": "User", - "account_settings": "User settings", - "settings": "Settings", - "projects": "Projects", - "events": "Events", - "create": { - "page_title": "Create user", - "error_notification": "Create user error", - "success_notification": "User is created" - }, - "edit": { - "error_notification": "Update user error", - "success_notification": "User updating is successful", - "refresh_token_success_notification": "Token rotating is successful", - "refresh_token_error_notification": "Token rotating error", - "refresh_token_confirm_title": "Rotate token", - "refresh_token_confirm_message": "Are you sure you want to rotate token?", - "refresh_token_button_label": "Rotate", - "validation": { - "user_name_format": "Only letters, numbers, - or _", - "email_format": "Incorrect email" - } - }, + "manual_payments": { + "title": "Credits history", + "add_payment": "Add payment", + "empty_message_title": "No payments", + "empty_message_text": "No payments to display.", - "manual_payments": { - "title": "Credits history", - "add_payment": "Add payment", - "empty_message_title": "No payments", - "empty_message_text": "No payments to display.", + "create": { + "success_notification": "Payment creating is successful" + }, - "create": { - "success_notification": "Payment creating is successful" - }, + "edit": { + "value": "Amount", + "value_description": "Enter amount here", + "description": "Description", + "description_description": "Describe payment here", + "created_at": "Created at" + } + }, - "edit": { - "value": 
"Amount", - "value_description": "Enter amount here", - "description": "Description", - "description_description": "Describe payment here", - "created_at": "Created at" - } + "token_copied": "Token copied" }, - - "token_copied": "Token copied" - }, - "billing": { - "title": "Billing", - "balance": "Balance", - "billing_history": "Billing history", - "payment_method": "Payment method", - "no_payment_method": "No payment method attached", - "top_up_balance": "Top up balance", - "edit_payment_method": "Edit payment method", - "payment_amount": "Payment amount", - "amount_description": "Minimum: ${{value}}", - "make_payment": "Make a payment", - "min_amount_error_message": "The amount is allowed to be more than {{value}}", - "payment_success_message": "Payment succeeded. There can be a short delay before the balance is updated." - }, - "validation": { - "required": "This is required field" - }, - "users_autosuggest": { - "placeholder": "Enter username or email to add member", - "entered_text": "Add member", - "loading": "Loading users", - "no_match": "No matches found" - }, - "roles": { - "admin": "Admin", - "manager": "Manager", - "user": "User" - }, - "confirm_dialog": { - "title": "Confirm delete", - "message": "Are you sure you want to delete?" - } + "billing": { + "title": "Billing", + "balance": "Balance", + "billing_history": "Billing history", + "payment_method": "Payment method", + "no_payment_method": "No payment method attached", + "top_up_balance": "Top up balance", + "edit_payment_method": "Edit payment method", + "payment_amount": "Payment amount", + "amount_description": "Minimum: ${{value}}", + "make_payment": "Make a payment", + "min_amount_error_message": "The amount is allowed to be more than {{value}}", + "payment_success_message": "Payment succeeded. There can be a short delay before the balance is updated." 
+ }, + "validation": { + "required": "This is required field" + }, + "users_autosuggest": { + "placeholder": "Enter username or email to add member", + "entered_text": "Add member", + "loading": "Loading users", + "no_match": "No matches found" + }, + "roles": { + "admin": "Admin", + "manager": "Manager", + "user": "User" + }, + "confirm_dialog": { + "title": "Confirm delete", + "message": "Are you sure you want to delete?" + } } diff --git a/frontend/src/pages/Offers/List/hooks/useFilters.ts b/frontend/src/pages/Offers/List/hooks/useFilters.ts index ce93ca2853..20c95402c0 100644 --- a/frontend/src/pages/Offers/List/hooks/useFilters.ts +++ b/frontend/src/pages/Offers/List/hooks/useFilters.ts @@ -14,13 +14,15 @@ import { import { getPropertyFilterOptions } from '../helpers'; -type Args = { +type RequestParamsKeys = 'project_name' | 'gpu_name' | 'gpu_count' | 'gpu_memory' | 'backend' | 'spot_policy' | 'group_by'; + +export type UseFiltersArgs = { gpus: IGpu[]; withSearchParams?: boolean; + permanentFilters?: Partial>; + defaultFilters?: Partial>; }; -type RequestParamsKeys = 'project_name' | 'gpu_name' | 'gpu_count' | 'gpu_memory' | 'backend' | 'spot_policy' | 'group_by'; - export const filterKeys: Record = { PROJECT_NAME: 'project_name', GPU_NAME: 'gpu_name', @@ -30,7 +32,7 @@ export const filterKeys: Record = { SPOT_POLICY: 'spot_policy', }; -const multipleChoiseKeys: RequestParamsKeys[] = ['gpu_name', 'backend']; +const multipleChoiceKeys: RequestParamsKeys[] = ['gpu_name', 'backend']; const spotPolicyOptions = [ { @@ -47,19 +49,52 @@ const spotPolicyOptions = [ }, ]; +const filteringProperties = [ + { + key: filterKeys.PROJECT_NAME, + operators: ['='], + propertyLabel: 'Project', + }, + { + key: filterKeys.GPU_NAME, + operators: ['='], + propertyLabel: 'GPU name', + }, + { + key: filterKeys.GPU_COUNT, + operators: ['<=', '>='], + propertyLabel: 'GPU count', + }, + { + key: filterKeys.GPU_MEMORY, + operators: ['<=', '>='], + propertyLabel: 'GPU memory', + }, + { + 
key: filterKeys.BACKEND, + operators: ['='], + propertyLabel: 'Backend', + }, + { + key: filterKeys.SPOT_POLICY, + operators: ['='], + propertyLabel: 'Spot policy', + }, +]; + const gpuFilterOption = { label: 'GPU', value: 'gpu' }; const defaultGroupByOptions = [{ ...gpuFilterOption }, { label: 'Backend', value: 'backend' }]; const groupByRequestParamName: RequestParamsKeys = 'group_by'; -export const useFilters = ({ gpus, withSearchParams = true }: Args) => { +export const useFilters = ({ gpus, withSearchParams = true, permanentFilters = {}, defaultFilters }: UseFiltersArgs) => { const [searchParams, setSearchParams] = useSearchParams(); const { projectOptions } = useProjectFilter({ localStorePrefix: 'offers-list-projects' }); const projectNameIsChecked = useRef(false); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => - requestParamsToTokens({ searchParams, filterKeys }), + requestParamsToTokens({ searchParams, filterKeys, defaultFilterValues: defaultFilters }), ); const [groupBy, setGroupBy] = useState(() => { @@ -133,6 +168,11 @@ export const useFilters = ({ gpus, withSearchParams = true }: Args) => { }); }, [groupBy]); + const filteringPropertiesForShowing = useMemo(() => { + const permanentFilterKeys = Object.keys(permanentFilters); + return filteringProperties.filter(({ key }) => !permanentFilterKeys.includes(key)); + }, [permanentFilters]); + const setSearchParamsHandle = ({ tokens, groupBy, @@ -151,43 +191,10 @@ export const useFilters = ({ gpus, withSearchParams = true }: Args) => { setSearchParams(searchParams); }; - const filteringProperties = [ - { - key: filterKeys.PROJECT_NAME, - operators: ['='], - propertyLabel: 'Project', - }, - { - key: filterKeys.GPU_NAME, - operators: ['='], - propertyLabel: 'GPU name', - }, - { - key: filterKeys.GPU_COUNT, - operators: ['<=', '>='], - propertyLabel: 'GPU count', - }, - { - key: filterKeys.GPU_MEMORY, - operators: ['<=', '>='], - propertyLabel: 'GPU memory', - }, - { - key: 
filterKeys.BACKEND, - operators: ['='], - propertyLabel: 'Backend', - }, - { - key: filterKeys.SPOT_POLICY, - operators: ['='], - propertyLabel: 'Spot policy', - }, - ]; - const onChangePropertyFilterHandle = ({ tokens, operation }: PropertyFilterProps.Query) => { const filteredTokens = tokens.filter((token, tokenIndex) => { return ( - multipleChoiseKeys.includes(token.propertyKey as RequestParamsKeys) || + multipleChoiceKeys.includes(token.propertyKey as RequestParamsKeys) || !tokens.some((item, index) => token.propertyKey === item.propertyKey && index > tokenIndex) ); }); @@ -227,13 +234,14 @@ export const useFilters = ({ gpus, withSearchParams = true }: Args) => { const filteringRequestParams = useMemo(() => { const params = tokensToRequestParams({ tokens: propertyFilterQuery.tokens, - arrayFieldKeys: multipleChoiseKeys, + arrayFieldKeys: multipleChoiceKeys, }); return { ...params, - } as Partial; - }, [propertyFilterQuery]); + ...permanentFilters, + }; + }, [propertyFilterQuery, permanentFilters]); useEffect(() => { if (!projectNameIsChecked.current && projectOptions.length) { @@ -261,7 +269,7 @@ export const useFilters = ({ gpus, withSearchParams = true }: Args) => { propertyFilterQuery, onChangePropertyFilter, filteringOptions, - filteringProperties, + filteringProperties: filteringPropertiesForShowing, groupBy, groupByOptions, onChangeGroupBy, diff --git a/frontend/src/pages/Offers/List/index.tsx b/frontend/src/pages/Offers/List/index.tsx index edf747d251..59d1bb5c7f 100644 --- a/frontend/src/pages/Offers/List/index.tsx +++ b/frontend/src/pages/Offers/List/index.tsx @@ -7,7 +7,7 @@ import { useCollection } from 'hooks'; import { useGetGpusListQuery } from 'services/gpu'; import { useEmptyMessages } from './hooks/useEmptyMessages'; -import { useFilters } from './hooks/useFilters'; +import { useFilters, UseFiltersArgs } from './hooks/useFilters'; import { convertMiBToGB, rangeToObject, renderRange, renderRangeJSX, round } from './helpers'; import styles from 
'./styles.module.scss'; @@ -66,20 +66,32 @@ const getRequestParams = ({ }; }; -type OfferListProps = Pick & { - withSearchParams?: boolean; - onChangeProjectName?: (value: string) => void; -}; +type OfferListProps = Pick & + Pick & { + withSearchParams?: boolean; + disabled?: boolean; + onChangeProjectName?: (value: string) => void; + onChangeBackendFilter?: (backends: string[]) => void; + }; -export const OfferList: React.FC = ({ withSearchParams, onChangeProjectName, ...props }) => { +export const OfferList: React.FC = ({ + withSearchParams, + disabled, + onChangeProjectName, + onChangeBackendFilter, + permanentFilters, + defaultFilters, + ...props +}) => { const { t } = useTranslation(); const [requestParams, setRequestParams] = useState(); + const { data, isLoading, isFetching } = useGetGpusListQuery( // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-expect-error requestParams, { - skip: !requestParams || !requestParams['project_name'] || !requestParams['group_by']?.length, + skip: disabled || !requestParams || !requestParams['project_name'] || !requestParams['group_by']?.length, }, ); @@ -93,7 +105,7 @@ export const OfferList: React.FC = ({ withSearchParams, onChange groupBy, groupByOptions, onChangeGroupBy, - } = useFilters({ gpus: data?.gpus ?? [], withSearchParams }); + } = useFilters({ gpus: data?.gpus ?? [], withSearchParams, permanentFilters, defaultFilters }); useEffect(() => { setRequestParams( @@ -110,6 +122,11 @@ export const OfferList: React.FC = ({ withSearchParams, onChange onChangeProjectName?.(filteringRequestParams.project_name ?? ''); }, [filteringRequestParams.project_name]); + useEffect(() => { + const backend = filteringRequestParams.backend; + onChangeBackendFilter?.(backend ? (Array.isArray(backend) ? 
backend : [backend]) : []); + }, [filteringRequestParams.backend]); + const { renderEmptyMessage, renderNoMatchMessage } = useEmptyMessages({ clearFilter, projectNameSelected: Boolean(requestParams?.['project_name']), @@ -200,15 +217,16 @@ export const OfferList: React.FC = ({ withSearchParams, onChange {...collectionProps} {...props} entireCardClickable - items={items} + items={disabled ? [] : items} + empty={disabled ? ' ' : undefined} cardDefinition={{ header: (gpu) => gpu.name, sections, }} - loading={isLoading || isFetching} + loading={!disabled && (isLoading || isFetching)} loadingText={t('common.loading')} stickyHeader={true} - filter={ + filter={disabled ? undefined : (
    = ({ withSearchParams, onChange />
    - } + )} /> ); }; diff --git a/frontend/src/pages/Runs/CreateDevEnvironment/constants.tsx b/frontend/src/pages/Runs/CreateDevEnvironment/constants.tsx deleted file mode 100644 index 98955d6a50..0000000000 --- a/frontend/src/pages/Runs/CreateDevEnvironment/constants.tsx +++ /dev/null @@ -1,50 +0,0 @@ -import React from 'react'; - -import { IRunEnvironmentFormKeys } from './types'; -export const CONFIG_INFO = { - header:

    Credits history

    , - body: ( - <> -

    Available for only the global admin role

    - - ), -}; - -export const FORM_FIELD_NAMES = { - offer: 'offer', - name: 'name', - ide: 'ide', - config_yaml: 'config_yaml', - docker: 'docker', - image: 'image', - python: 'python', - repo_enabled: 'repo_enabled', - repo_url: 'repo_url', - repo_path: 'repo_path', - working_dir: 'working_dir', -} as const satisfies Record; - -export const IDE_OPTIONS = [ - { - label: 'Cursor', - value: 'cursor', - }, - { - label: 'VS Code', - value: 'vscode', - }, - { - label: 'Windsurf', - value: 'windsurf', - }, -] as const; - -export const IDE_DISPLAY_NAMES: Record = { - cursor: 'Cursor', - vscode: 'VS Code', - windsurf: 'Windsurf', -}; - -export const getIDEDisplayName = (ide: string): string => { - return IDE_DISPLAY_NAMES[ide] || 'IDE'; -}; diff --git a/frontend/src/pages/Runs/CreateDevEnvironment/hooks/useGenerateYaml.ts b/frontend/src/pages/Runs/CreateDevEnvironment/hooks/useGenerateYaml.ts deleted file mode 100644 index a693985fc0..0000000000 --- a/frontend/src/pages/Runs/CreateDevEnvironment/hooks/useGenerateYaml.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { useMemo } from 'react'; -import jsYaml from 'js-yaml'; - -import { convertMiBToGB, renderRange, round } from 'pages/Offers/List/helpers'; - -import { IRunEnvironmentFormValues } from '../types'; - -export type UseGenerateYamlArgs = { - formValues: IRunEnvironmentFormValues; -}; - -export const useGenerateYaml = ({ formValues }: UseGenerateYamlArgs) => { - return useMemo(() => { - if (!formValues.offer || !formValues.ide) { - return ''; - } - - const { name, ide, image, python, offer, docker, repo_url, repo_path, working_dir } = formValues; - - return jsYaml.dump({ - type: 'dev-environment', - ...(name ? { name } : {}), - ide, - ...(docker ? { docker } : {}), - ...(image ? { image } : {}), - ...(python ? { python } : {}), - - resources: { - gpu: `${offer.name}:${round(convertMiBToGB(offer.memory_mib))}GB:${renderRange(offer.count)}`, - }, - - ...(repo_url || repo_path - ? 
{ - repos: [[repo_url?.trim(), repo_path?.trim()].filter(Boolean).join(':')], - } - : {}), - - ...(working_dir ? { working_dir } : {}), - backends: offer.backends, - spot_policy: 'auto', - }); - }, [ - formValues.name, - formValues.ide, - formValues.offer, - formValues.python, - formValues.image, - formValues.repo_url, - formValues.repo_path, - formValues.working_dir, - ]); -}; diff --git a/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx b/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx deleted file mode 100644 index fcac5194af..0000000000 --- a/frontend/src/pages/Runs/CreateDevEnvironment/index.tsx +++ /dev/null @@ -1,457 +0,0 @@ -import React, { useCallback, useEffect, useState } from 'react'; -import { useForm } from 'react-hook-form'; -import { useTranslation } from 'react-i18next'; -import { useNavigate, useSearchParams } from 'react-router-dom'; -import cn from 'classnames'; -import * as yup from 'yup'; -import { Box, Link, WizardProps } from '@cloudscape-design/components'; -import { CardsProps } from '@cloudscape-design/components/cards'; - -import { TabsProps, ToggleProps } from 'components'; -import { Container, FormCodeEditor, FormField, FormInput, FormSelect, SpaceBetween, Tabs, Toggle, Wizard } from 'components'; - -import { useBreadcrumbs, useNotifications } from 'hooks'; -import { useCheckingForFleetsInProjects } from 'hooks/useCheckingForFleetsInProjectsOfMember'; -import { getServerError } from 'libs'; -import { ROUTES } from 'routes'; -import { useApplyRunMutation } from 'services/run'; - -import { OfferList } from 'pages/Offers/List'; -import { NoFleetProjectAlert } from 'pages/Project/components/NoFleetProjectAlert'; - -import { useGenerateYaml } from './hooks/useGenerateYaml'; -import { useGetRunSpecFromYaml } from './hooks/useGetRunSpecFromYaml'; -import { FORM_FIELD_NAMES, IDE_OPTIONS } from './constants'; - -import { IRunEnvironmentFormKeys, IRunEnvironmentFormValues } from './types'; - -import styles from 
'./styles.module.scss'; - -const requiredFieldError = 'This is a required field'; -const namesFieldError = 'Only latin characters, dashes, and digits'; -const urlFormatError = 'Only URLs'; -const workingDirFormatError = 'Must be an absolute path'; - -enum DockerPythonTabs { - DOCKER = 'docker', - PYTHON = 'python', -} - -const envValidationSchema = yup.object({ - offer: yup.object().required(requiredFieldError), - name: yup.string().matches(/^[a-z][a-z0-9-]{1,40}$/, namesFieldError), - ide: yup.string().required(requiredFieldError), - config_yaml: yup.string().required(requiredFieldError), - working_dir: yup.string().matches(/^\//, workingDirFormatError), - - image: yup.string().when('docker', { - is: true, - then: yup.string().required(requiredFieldError), - }), - - repo_url: yup.string().when('repo_enabled', { - is: true, - then: yup - .string() - // eslint-disable-next-line no-useless-escape - .matches(/^(https?):\/\/([^\s\/?#]+)((?:\/[^\s?#]*)*)(?::\/(.*))?$/i, urlFormatError) - .required(requiredFieldError), - }), -}); - -// eslint-disable-next-line @typescript-eslint/ban-ts-comment -// @ts-expect-error -const useYupValidationResolver = (validationSchema) => - useCallback( - async (data: IRunEnvironmentFormValues) => { - try { - const values = await validationSchema.validate(data, { - abortEarly: false, - }); - - return { - values, - errors: {}, - }; - } catch (errors) { - return { - values: {}, - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-expect-error - errors: errors.inner.reduce( - // eslint-disable-next-line @typescript-eslint/ban-ts-comment - // @ts-expect-error - (allErrors, currentError) => ({ - ...allErrors, - [currentError.path]: { - type: currentError.type ?? 
'validation', - message: currentError.message, - }, - }), - {}, - ), - }; - } - }, - [validationSchema], - ); - -export const CreateDevEnvironment: React.FC = () => { - const { t } = useTranslation(); - const [searchParams] = useSearchParams(); - const navigate = useNavigate(); - const [pushNotification] = useNotifications(); - const [activeStepIndex, setActiveStepIndex] = useState(0); - const [selectedOffers, setSelectedOffers] = useState([]); - const [selectedProject, setSelectedProject] = useState( - () => searchParams.get('project_name') ?? null, - ); - - const [getRunSpecFromYaml] = useGetRunSpecFromYaml({ projectName: selectedProject ?? '' }); - - const projectHavingFleetMap = useCheckingForFleetsInProjects({ projectNames: selectedProject ? [selectedProject] : [] }); - const projectDontHasFleets = !!selectedProject && !projectHavingFleetMap[selectedProject]; - - const [applyRun, { isLoading: isApplying }] = useApplyRunMutation(); - - const loading = isApplying; - - useBreadcrumbs([ - { - text: t('projects.runs'), - href: ROUTES.RUNS.LIST, - }, - { - text: t('runs.dev_env.wizard.title'), - href: ROUTES.RUNS.CREATE_DEV_ENV, - }, - ]); - - const resolver = useYupValidationResolver(envValidationSchema); - const formMethods = useForm({ - resolver, - defaultValues: { - ide: 'cursor', - docker: false, - repo_enabled: false, - }, - }); - const { handleSubmit, control, trigger, setValue, watch, formState, getValues } = formMethods; - const formValues = watch(); - - const onCancelHandler = () => { - navigate(ROUTES.RUNS.LIST); - }; - - const validateOffer = async () => { - return await trigger(['offer']); - }; - - const validateSecondStep = async () => { - const secondStepFields = Object.keys(FORM_FIELD_NAMES).filter( - (fieldName) => !['offer', 'config_yaml'].includes(fieldName), - ) as IRunEnvironmentFormKeys[]; - - return await trigger(secondStepFields); - }; - - const validateConfig = async () => { - return await trigger(['config_yaml']); - }; - - const onNavigate 
= ({ - requestedStepIndex, - reason, - }: { - requestedStepIndex: number; - reason: WizardProps.NavigationReason; - }) => { - const stepValidators = [validateOffer, validateSecondStep, validateConfig]; - - if (reason === 'next') { - if (projectDontHasFleets) { - window.scrollTo(0, 0); - } - - stepValidators[activeStepIndex]?.().then((isValid) => { - if (isValid) { - setActiveStepIndex(requestedStepIndex); - } else if (activeStepIndex == 0) { - window.scrollTo(0, 0); - } - }); - } else { - setActiveStepIndex(requestedStepIndex); - } - }; - - const onNavigateHandler: WizardProps['onNavigate'] = ({ detail: { requestedStepIndex, reason } }) => { - onNavigate({ requestedStepIndex, reason }); - }; - - const toggleRepo: ToggleProps['onChange'] = ({ detail }) => { - setValue('repo_enabled', detail.checked); - - if (!detail.checked) { - setValue('repo_url', ''); - setValue('repo_path', ''); - } - }; - - const onChangeTab: TabsProps['onChange'] = ({ detail }) => { - if (detail.activeTabId === DockerPythonTabs.DOCKER) { - setValue('python', ''); - } - - if (detail.activeTabId === DockerPythonTabs.PYTHON) { - setValue('image', ''); - } - - setValue('docker', detail.activeTabId === DockerPythonTabs.DOCKER); - }; - - const onChangeOffer: CardsProps['onSelectionChange'] = ({ detail }) => { - const newSelectedOffers = detail?.selectedItems ?? []; - setSelectedOffers(newSelectedOffers); - setValue('offer', newSelectedOffers?.[0] ?? null); - }; - - const onSubmitWizard = async () => { - const isValid = await trigger(); - - if (!isValid) { - return; - } - - const { config_yaml } = getValues(); - - let runSpec; - - try { - runSpec = await getRunSpecFromYaml(config_yaml); - } catch (e) { - console.log('parse transaction error:', e); - return; - } - - const requestParams: TRunApplyRequestParams = { - project_name: selectedProject ?? 
'', - plan: { - run_spec: runSpec, - }, - force: false, - }; - - applyRun(requestParams) - .unwrap() - .then((data) => { - pushNotification({ - type: 'success', - content: t('runs.dev_env.wizard.success_notification'), - }); - - navigate(ROUTES.PROJECT.DETAILS.RUNS.DETAILS.FORMAT(data.project_name, data.id)); - }) - .catch((error) => { - pushNotification({ - type: 'error', - content: t('common.server_error', { error: getServerError(error) }), - }); - }); - }; - - const onSubmit = () => { - if (activeStepIndex < 3) { - onNavigate({ requestedStepIndex: activeStepIndex + 1, reason: 'next' }); - } else { - onSubmitWizard().catch(console.log); - } - }; - - const yaml = useGenerateYaml({ formValues }); - - useEffect(() => { - setValue('config_yaml', yaml); - }, [yaml]); - - return ( - - - - `Step ${stepNumber}`, - navigationAriaLabel: 'Steps', - cancelButton: t('common.cancel'), - previousButton: t('common.previous'), - nextButton: t('common.next'), - optional: 'optional', - }} - onCancel={onCancelHandler} - submitButtonText={t('runs.dev_env.wizard.submit')} - steps={[ - { - title: 'Resources', - content: ( - <> - - {formState.errors.offer?.message &&
    } - setSelectedProject(projectName)} - selectionType="single" - withSearchParams={false} - selectedItems={selectedOffers} - onSelectionChange={onChangeOffer} - /> - - ), - }, - - { - title: 'Settings', - content: ( - - - - - - - - - - ), - }, - { - label: t('runs.dev_env.wizard.docker'), - id: DockerPythonTabs.DOCKER, - content: ( -
    - -
    - ), - }, - ]} - /> - - - - - {t('runs.dev_env.wizard.repo')} - - - {formValues.repo_enabled && ( - <> - - - - - )} -
    -
    - ), - }, - - { - title: 'Configuration', - content: ( - - - Review the configuration file and adjust it if needed. See{' '} - - examples - - . - - } - name="config_yaml" - language="yaml" - loading={loading} - editorContentHeight={600} - /> - - ), - }, - ]} - /> - - ); -}; diff --git a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx index b2751253ab..af63e9c67b 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx @@ -19,7 +19,7 @@ import { import { copyToClipboard } from 'libs'; import { useConfigProjectCliCommand } from 'pages/Project/hooks/useConfigProjectCliComand'; -import { getIDEDisplayName } from 'pages/Runs/CreateDevEnvironment/constants'; +import { getIDEDisplayName } from 'pages/Runs/Launch/constants'; import styles from './styles.module.scss'; diff --git a/frontend/src/pages/Runs/Details/RunDetails/ConnectToServiceRun/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/ConnectToServiceRun/index.tsx new file mode 100644 index 0000000000..032e876a57 --- /dev/null +++ b/frontend/src/pages/Runs/Details/RunDetails/ConnectToServiceRun/index.tsx @@ -0,0 +1,47 @@ +import React, { FC } from 'react'; + +import { Alert, Box, Container, Header, Link, SpaceBetween } from 'components'; + +import { getRunProbeStatuses } from 'libs/run'; + +import { getRunListItemServiceUrl } from '../../../List/helpers'; + +export const ConnectToServiceRun: FC<{ run: IRun }> = ({ run }) => { + const serviceUrl = getRunListItemServiceUrl(run); + const probeStatuses = getRunProbeStatuses(run); + const hasProbes = probeStatuses.length > 0; + const allProbesReady = hasProbes && probeStatuses.every((s) => s === 'success'); + const serviceReady = run.status === 'running' && (!hasProbes || allProbesReady) && 
serviceUrl; + + return ( + +
    Endpoint
    + + {run.status !== 'running' && ( + + + Waiting for the service to start. + + )} + + {run.status === 'running' && !serviceReady && ( + + + Waiting for the service to become ready. + + )} + + {serviceReady && ( + + + + The service is ready at{' '} + + {serviceUrl} + + + + )} +
    + ); +}; diff --git a/frontend/src/pages/Runs/Details/RunDetails/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/index.tsx index f878f367ff..e899ad0bc3 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/index.tsx @@ -29,12 +29,12 @@ import { getRunListItemRegion, getRunListItemResources, getRunListItemSchedule, - getRunListItemServiceUrl, getRunListItemSpot, } from '../../List/helpers'; import { EventsList } from '../Events/List'; import { JobList } from '../Jobs/List'; import { ConnectToRunWithDevEnvConfiguration } from './ConnectToRunWithDevEnvConfiguration'; +import { ConnectToServiceRun } from './ConnectToServiceRun'; export const RunDetails = () => { const { t } = useTranslation(); @@ -47,7 +47,6 @@ export const RunDetails = () => { id: paramRunId, }); - const serviceUrl = runData ? getRunListItemServiceUrl(runData) : null; const schedule = runData ? getRunListItemSchedule(runData) : null; const nextTriggeredAt = runData ? runData.next_triggered_at : null; @@ -191,17 +190,6 @@ export const RunDetails = () => { - {serviceUrl && ( - -
    - {t('projects.run.service_url')} - -
    -
    - )} - {schedule && (
    @@ -220,6 +208,10 @@ export const RunDetails = () => { )} + {runData.run_spec.configuration.type === 'service' && !runIsStopped(runData.status) && ( + + )} + {runData.jobs.length > 1 && ( ; + loading?: boolean; + template?: ITemplate; +}; + +enum DockerPythonTabs { + DOCKER = 'docker', + PYTHON = 'python', +} + +export const ParamsWizardStep: React.FC = ({ formMethods, template, loading }) => { + const { t } = useTranslation(); + const { control, setValue, watch, getValues } = formMethods; + const [openHelpPanel] = useHelpPanel(); + + const [dockerPythonTab, setDockerPythonTab] = React.useState(() => { + if (getValues(FORM_FIELD_NAMES.image)) { + return DockerPythonTabs.DOCKER; + } + return DockerPythonTabs.PYTHON; + }); + + const isEnabledRepo = Boolean(watch(FORM_FIELD_NAMES.repo_enabled)); + + const toggleRepo: ToggleProps['onChange'] = ({ detail }) => { + if (!detail.checked) { + setValue(FORM_FIELD_NAMES.repo_url, ''); + setValue(FORM_FIELD_NAMES.repo_path, ''); + } + }; + + const onChangeTab: TabsProps['onChange'] = ({ detail }) => { + setDockerPythonTab(detail.activeTabId as DockerPythonTabs); + + if (detail.activeTabId === DockerPythonTabs.DOCKER) { + setValue(FORM_FIELD_NAMES.python, ''); + } + + if (detail.activeTabId === DockerPythonTabs.PYTHON) { + setValue(FORM_FIELD_NAMES.image, ''); + } + + setValue(FORM_FIELD_NAMES.docker, detail.activeTabId === DockerPythonTabs.DOCKER); + }; + + const defaultPassword = generateSecurePassword(20); + + const paramsMap = useMemo>(() => { + if (!template) { + return new Map(); + } + + return new Map(template.parameters.map((parameter) => [parameter.type, parameter])); + }, [template]); + + const renderName = () => { + if (!paramsMap.get('name')) { + return null; + } + + return ( + + ); + }; + + const copyPassword = () => { + copyToClipboard(getValues(FORM_FIELD_NAMES.password) ?? 
''); + setValue(FORM_FIELD_NAMES.password_copied, true, { shouldValidate: true }); + }; + + const renderIde = () => { + if (!paramsMap.get('ide')) { + return null; + } + + return ( + + ); + }; + + const renderPythonOrDocker = () => { + if (!paramsMap.get('python_or_docker')) { + return null; + } + + return ( + + +
    + ), + }, + { + label: t('runs.launch.wizard.docker'), + id: DockerPythonTabs.DOCKER, + content: ( +
    + +
    + ), + }, + ]} + /> + ); + }; + + const renderWorkingDir = () => { + if (!paramsMap.get('working_dir')) { + return null; + } + + return ( + + ); + }; + + const renderRepo = () => { + if (!paramsMap.get('repo')) { + return null; + } + + return ( + + + + {isEnabledRepo && ( + + + + + + )} + + ); + }; + + const renderEnv = () => { + const envParameter = paramsMap.get('env'); + + if (!envParameter) { + return null; + } + + const isRandomPassword = envParameter.value === '$random-password'; + + if (isRandomPassword) { + return ( + openHelpPanel(PASSWORD_INFO)} />} + control={control} + name={FORM_FIELD_NAMES.password} + defaultValue={defaultPassword} + type="password" + disabled={loading} + secondaryControl={ + Password copied} + > +
    {t('common.full_view')}}> + {t('navigation.events')} + + } + footer={} + /> + ); +}; diff --git a/frontend/src/pages/Instances/Details/Inspect/index.tsx b/frontend/src/pages/Instances/Details/Inspect/index.tsx new file mode 100644 index 0000000000..a9c9ac2594 --- /dev/null +++ b/frontend/src/pages/Instances/Details/Inspect/index.tsx @@ -0,0 +1,69 @@ +import React, { useEffect, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { CodeEditor, Container, Header, Loader } from 'components'; + +import { useGetInstanceDetailsQuery } from 'services/instance'; + +interface AceEditorElement extends HTMLElement { + env?: { + editor?: { + setReadOnly: (readOnly: boolean) => void; + }; + }; +} + +export const InstanceInspect = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramProjectName = params.projectName ?? ''; + const paramInstanceId = params.instanceId ?? ''; + + const { data, isLoading } = useGetInstanceDetailsQuery( + { + projectName: paramProjectName, + instanceId: paramInstanceId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + const jsonContent = useMemo(() => { + if (!data) return ''; + return JSON.stringify(data, null, 2); + }, [data]); + + useEffect(() => { + const timer = setTimeout(() => { + const editorElements = document.querySelectorAll('.ace_editor'); + editorElements.forEach((element: Element) => { + const aceEditor = (element as AceEditorElement).env?.editor; + if (aceEditor) { + aceEditor.setReadOnly(true); + } + }); + }, 100); + + return () => clearTimeout(timer); + }, [jsonContent]); + + if (isLoading) + return ( + + + + ); + + return ( + {t('fleets.instances.inspect')}}> + {}} + /> + + ); +}; diff --git a/frontend/src/pages/Instances/Details/InstanceDetails/index.tsx b/frontend/src/pages/Instances/Details/InstanceDetails/index.tsx new file mode 100644 index 0000000000..408698b777 --- /dev/null +++ 
b/frontend/src/pages/Instances/Details/InstanceDetails/index.tsx @@ -0,0 +1,161 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; +import { format } from 'date-fns'; + +import { Box, ColumnLayout, Container, Header, Loader, NavigateLink, StatusIndicator } from 'components'; + +import { DATE_TIME_FORMAT } from 'consts'; +import { formatBackend, getStatusIconType } from 'libs/fleet'; +import { getHealthStatusIconType, prettyEnumValue } from 'libs/instance'; +import { formatResources } from 'libs/resources'; +import { ROUTES } from 'routes'; +import { useGetInstanceDetailsQuery } from 'services/instance'; + +export const InstanceDetails = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramInstanceId = params.instanceId ?? ''; + const paramProjectName = params.projectName ?? ''; + + const { data, isLoading } = useGetInstanceDetailsQuery( + { + projectName: paramProjectName, + instanceId: paramInstanceId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + return ( + <> + {isLoading && ( + + + + )} + + {data && ( + {t('common.general')}}> + +
    + {t('fleets.instances.project')} +
    + + {data.project_name} + +
    +
    + +
    + {t('fleets.fleet')} +
    + {data.fleet_name && data.fleet_id ? ( + + {data.fleet_name} + + ) : ( + '-' + )} +
    +
    + +
    + {t('fleets.instances.status')} +
    + + {(data.status === 'idle' || data.status === 'busy') && + data.total_blocks !== null && + data.total_blocks > 1 + ? `${data.busy_blocks}/${data.total_blocks} Busy` + : prettyEnumValue(data.status)} + +
    +
    + +
    + {t('projects.run.error')} +
    + {data.unreachable ? ( + Unreachable + ) : data.health_status !== 'healthy' ? ( + + {prettyEnumValue(data.health_status)} + + ) : ( + '-' + )} +
    +
    + +
    + {t('fleets.instances.started')} +
    {format(new Date(data.created), DATE_TIME_FORMAT)}
    +
    + +
    + {t('fleets.instances.finished_at')} +
    + {data.finished_at ? format(new Date(data.finished_at), DATE_TIME_FORMAT) : '-'} +
    +
    + + {data.termination_reason && ( +
    + {t('fleets.instances.termination_reason')} +
    + {data.termination_reason_message ?? prettyEnumValue(data.termination_reason)} +
    +
    + )} + +
    + {t('fleets.instances.resources')} +
    {data.instance_type ? formatResources(data.instance_type.resources) : '-'}
    +
    + +
    + {t('fleets.instances.backend')} +
    {formatBackend(data.backend)}
    +
    + +
    + {t('fleets.instances.region')} +
    {data.region ?? '-'}
    +
    + +
    + {t('fleets.instances.instance_type')} +
    {data.instance_type?.name ?? '-'}
    +
    + +
    + {t('fleets.instances.spot')} +
    {data.instance_type?.resources.spot ? t('common.yes') : t('common.no')}
    +
    + +
    + {t('fleets.instances.price')} +
    {typeof data.price === 'number' ? `$${data.price}` : '-'}
    +
    + + {data.total_blocks !== null && ( +
    + {t('fleets.instances.blocks')} +
    {data.total_blocks}
    +
    + )} + +
    + {t('fleets.instances.hostname')} +
    {data.hostname ?? '-'}
    +
    +
    +
    + )} + + ); +}; diff --git a/frontend/src/pages/Instances/Details/index.tsx b/frontend/src/pages/Instances/Details/index.tsx new file mode 100644 index 0000000000..2c7f1a2b5c --- /dev/null +++ b/frontend/src/pages/Instances/Details/index.tsx @@ -0,0 +1,110 @@ +import React from 'react'; +import { useTranslation } from 'react-i18next'; +import { Outlet, useNavigate, useParams } from 'react-router-dom'; + +import { Button, ContentLayout, DetailsHeader, Tabs } from 'components'; + +enum InstanceTab { + Details = 'details', + Events = 'events', + Inspect = 'inspect', +} + +import { useBreadcrumbs } from 'hooks'; +import { ROUTES } from 'routes'; +import { useGetInstanceDetailsQuery } from 'services/instance'; + +import { useDeleteInstance } from './useDeleteInstance'; + +import styles from './styles.module.scss'; + +export const InstanceDetailsPage: React.FC = () => { + const { t } = useTranslation(); + const params = useParams(); + const paramInstanceId = params.instanceId ?? ''; + const paramProjectName = params.projectName ?? ''; + const navigate = useNavigate(); + + const { deleteInstance, isDeleting } = useDeleteInstance(); + + const { data } = useGetInstanceDetailsQuery( + { + projectName: paramProjectName, + instanceId: paramInstanceId, + }, + { + refetchOnMountOrArgChange: true, + }, + ); + + useBreadcrumbs([ + { + text: t('navigation.project_other'), + href: ROUTES.PROJECT.LIST, + }, + { + text: paramProjectName, + href: ROUTES.PROJECT.DETAILS.FORMAT(paramProjectName), + }, + { + text: t('navigation.instances'), + href: ROUTES.INSTANCES.LIST, + }, + { + text: data?.name ?? '', + href: ROUTES.INSTANCES.DETAILS.FORMAT(paramProjectName, paramInstanceId), + }, + ]); + + const deleteClickHandle = () => { + if (!data) return; + + deleteInstance(data) + .then(() => { + navigate(ROUTES.INSTANCES.LIST); + }) + .catch(console.log); + }; + + const isDisabledDeleteButton = !data || isDeleting || data.status === 'terminated'; + + return ( +
    + + {t('common.delete')} + + } + /> + } + > + + + + +
    + ); +}; diff --git a/frontend/src/pages/Instances/Details/styles.module.scss b/frontend/src/pages/Instances/Details/styles.module.scss new file mode 100644 index 0000000000..1a7d41a9c5 --- /dev/null +++ b/frontend/src/pages/Instances/Details/styles.module.scss @@ -0,0 +1,18 @@ +.page { + height: 100%; + + & [class^="awsui_tabs-content"] { + display: none; + } + + & > [class^="awsui_layout"] { + height: 100%; + + & > [class^="awsui_content"] { + display: flex; + flex-direction: column; + gap: 20px; + height: 100%; + } + } +} diff --git a/frontend/src/pages/Instances/Details/useDeleteInstance.ts b/frontend/src/pages/Instances/Details/useDeleteInstance.ts new file mode 100644 index 0000000000..b460ed4373 --- /dev/null +++ b/frontend/src/pages/Instances/Details/useDeleteInstance.ts @@ -0,0 +1,38 @@ +import { useCallback, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useNotifications } from 'hooks'; +import { getServerError } from 'libs'; +import { useDeleteInstancesMutation } from 'services/instance'; + +export const useDeleteInstance = () => { + const { t } = useTranslation(); + const [deleteInstances] = useDeleteInstancesMutation(); + const [pushNotification] = useNotifications(); + const [isDeleting, setIsDeleting] = useState(false); + + const deleteInstance = useCallback(async (instance: IInstance) => { + if (!instance.project_name || !instance.fleet_name) { + return Promise.reject('Missing project or fleet name'); + } + + setIsDeleting(true); + + return deleteInstances({ + projectName: instance.project_name, + fleetName: instance.fleet_name, + instancesNums: [instance.instance_num], + }) + .unwrap() + .finally(() => setIsDeleting(false)) + .catch((error) => { + pushNotification({ + type: 'error', + content: t('common.server_error', { error: getServerError(error) }), + }); + throw error; + }); + }, []); + + return { deleteInstance, isDeleting } as const; +}; diff --git 
a/frontend/src/pages/Instances/List/hooks/useColumnDefinitions.tsx b/frontend/src/pages/Instances/List/hooks/useColumnDefinitions.tsx index 941f87c65e..c00834fb23 100644 --- a/frontend/src/pages/Instances/List/hooks/useColumnDefinitions.tsx +++ b/frontend/src/pages/Instances/List/hooks/useColumnDefinitions.tsx @@ -5,14 +5,25 @@ import { format } from 'date-fns'; import { Icon, NavigateLink, StatusIndicator, TableProps } from 'components'; import { DATE_TIME_FORMAT } from 'consts'; -import { getStatusIconType } from 'libs/fleet'; - -import { ROUTES } from '../../../../routes'; +import { formatBackend, getStatusIconType } from 'libs/fleet'; +import { formatInstanceStatusText, getHealthStatusIconType, prettyEnumValue } from 'libs/instance'; +import { formatResources } from 'libs/resources'; +import { ROUTES } from 'routes'; export const useColumnsDefinitions = () => { const { t } = useTranslation(); const columns: TableProps.ColumnDefinition[] = [ + { + id: 'name', + header: t('fleets.instances.instance_name'), + cell: (item) => + item.project_name ? 
( + {item.name} + ) : ( + item.name + ), + }, { id: 'fleet_name', header: t('fleets.fleet'), @@ -25,11 +36,6 @@ export const useColumnsDefinitions = () => { '-' ), }, - { - id: 'instance_num', - header: t('fleets.instances.instance_num'), - cell: (item) => item.instance_num, - }, { id: 'project_name', header: t('fleets.instances.project'), @@ -40,6 +46,27 @@ export const useColumnsDefinitions = () => { item.project_name ), }, + { + id: 'status', + header: t('fleets.instances.status'), + cell: (item) => ( + {formatInstanceStatusText(item)} + ), + }, + { + id: 'error', + header: t('projects.run.error'), + cell: (item) => { + if (item.unreachable) return Unreachable; + if (item.health_status !== 'healthy') + return ( + + {prettyEnumValue(item.health_status)} + + ); + return null; + }, + }, { id: 'hostname', header: t('fleets.instances.hostname'), @@ -48,7 +75,12 @@ export const useColumnsDefinitions = () => { { id: 'backend', header: t('fleets.instances.backend'), - cell: (item) => item.backend, + cell: (item) => formatBackend(item.backend), + }, + { + id: 'price', + header: t('fleets.instances.price'), + cell: (item) => (typeof item.price === 'number' ? `$${item.price}` : '-'), }, { id: 'region', @@ -63,31 +95,22 @@ export const useColumnsDefinitions = () => { { id: 'resources', header: t('fleets.instances.resources'), - cell: (item) => item.instance_type?.resources.description ?? '-', + cell: (item) => (item.instance_type ? formatResources(item.instance_type.resources) : '-'), }, { id: 'spot', header: t('fleets.instances.spot'), cell: (item) => item.instance_type?.resources.spot && , }, - { - id: 'status', - header: t('fleets.instances.status'), - cell: (item) => ( - - {t(`fleets.instances.statuses.${item.status}`)} - - ), - }, { id: 'started', header: t('fleets.instances.started'), cell: (item) => format(new Date(item.created), DATE_TIME_FORMAT), }, { - id: 'price', - header: t('fleets.instances.price'), - cell: (item) => (typeof item.price === 'number' ? 
`$${item.price}` : '-'), + id: 'finished_at', + header: t('fleets.instances.finished_at'), + cell: (item) => (item.finished_at ? format(new Date(item.finished_at), DATE_TIME_FORMAT) : '-'), }, ]; diff --git a/frontend/src/pages/Instances/index.ts b/frontend/src/pages/Instances/index.ts index ee1bcdcc2d..15199c378d 100644 --- a/frontend/src/pages/Instances/index.ts +++ b/frontend/src/pages/Instances/index.ts @@ -1 +1,2 @@ export { List as InstanceList } from './List'; +export { InstanceDetailsPage } from './Details'; diff --git a/frontend/src/pages/Runs/Details/Jobs/List/helpers.ts b/frontend/src/pages/Runs/Details/Jobs/List/helpers.ts index cb5ef5a468..6fd1f30778 100644 --- a/frontend/src/pages/Runs/Details/Jobs/List/helpers.ts +++ b/frontend/src/pages/Runs/Details/Jobs/List/helpers.ts @@ -3,9 +3,12 @@ import type { StatusIndicatorProps } from '@cloudscape-design/components/status- import { DATE_TIME_FORMAT } from 'consts'; import { capitalize } from 'libs'; +import { formatBackend } from 'libs/fleet'; +import { formatResources } from 'libs/resources'; export const getJobListItemResources = (job: IJob) => { - return job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.instance_type?.resources?.description; + const resources = job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.instance_type?.resources; + return resources ? formatResources(resources) : '-'; }; export const getJobListItemSpot = (job: IJob) => { @@ -31,7 +34,7 @@ export const getJobListItemRegion = (job: IJob) => { }; export const getJobListItemBackend = (job: IJob) => { - return job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.backend ?? 
'-'; + return formatBackend(job.job_submissions?.[job.job_submissions.length - 1]?.job_provisioning_data?.backend); }; export const getJobSubmittedAt = (job: IJob) => { diff --git a/frontend/src/pages/Runs/Details/RunDetails/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/index.tsx index e899ad0bc3..408d4cb16b 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/index.tsx @@ -1,7 +1,6 @@ import React from 'react'; import { useTranslation } from 'react-i18next'; import { useParams } from 'react-router-dom'; -import { get as _get } from 'lodash'; import { format } from 'date-fns'; import { Box, ColumnLayout, Container, Header, Loader, NavigateLink, StatusIndicator } from 'components'; @@ -29,7 +28,8 @@ import { getRunListItemRegion, getRunListItemResources, getRunListItemSchedule, - getRunListItemSpot, + getRunListItemServiceUrl, + getRunListItemSpotLabelKey, } from '../../List/helpers'; import { EventsList } from '../Events/List'; import { JobList } from '../Jobs/List'; @@ -93,35 +93,14 @@ export const RunDetails = () => { -
    - {t('projects.run.repo')} - -
    - {_get(runData.run_spec.repo_data, 'repo_name', _get(runData.run_spec.repo_data, 'repo_dir', '-'))} -
    -
    - -
    - {t('projects.run.hub_user_name')} - -
    - {runData.user} -
    -
    -
    {t('projects.run.configuration')}
    {runData.run_spec.configuration_path}
    - {t('projects.run.submitted_at')} -
    {format(new Date(runData.submitted_at), DATE_TIME_FORMAT)}
    -
    - -
    - {t('projects.run.finished_at')} -
    {finishedAt ? format(new Date(finishedAt), DATE_TIME_FORMAT) : '-'}
    + {t('projects.run.resources')} +
    {getRunListItemResources(runData)}
    @@ -145,18 +124,26 @@ export const RunDetails = () => { )}
    - {t('projects.run.error')} -
    {getRunError(runData) ?? '-'}
    + {t('projects.run.hub_user_name')} + +
    + {runData.user} +
    - {t('projects.run.priority')} -
    {getRunPriority(runData)}
    + {t('projects.run.submitted_at')} +
    {format(new Date(runData.submitted_at), DATE_TIME_FORMAT)}
    - {t('projects.run.cost')} -
    ${runData.cost}
    + {t('projects.run.finished_at')} +
    {finishedAt ? format(new Date(finishedAt), DATE_TIME_FORMAT) : '-'}
    +
    + +
    + {t('projects.run.error')} +
    {getRunError(runData) ?? '-'}
    @@ -165,8 +152,13 @@ export const RunDetails = () => {
    - {t('projects.run.resources')} -
    {getRunListItemResources(runData)}
    + {t('projects.run.cost')} +
    ${runData.cost}
    +
    + +
    + {t('projects.run.spot')} +
    {t(getRunListItemSpotLabelKey(runData))}
    @@ -180,13 +172,13 @@ export const RunDetails = () => {
    - {t('projects.run.instance_id')} -
    {getRunListItemInstanceId(runData)}
    + {t('projects.run.priority')} +
    {getRunPriority(runData)}
    - {t('projects.run.spot')} -
    {getRunListItemSpot(runData)}
    + {t('projects.run.instance_id')} +
    {getRunListItemInstanceId(runData)}
    diff --git a/frontend/src/pages/Runs/List/Preferences/consts.ts b/frontend/src/pages/Runs/List/Preferences/consts.ts index bffa6b83f6..1e95dfb706 100644 --- a/frontend/src/pages/Runs/List/Preferences/consts.ts +++ b/frontend/src/pages/Runs/List/Preferences/consts.ts @@ -5,21 +5,21 @@ export const DEFAULT_PREFERENCES: CollectionPreferencesProps.Preferences = { contentDisplay: [ { id: 'run_name', visible: true }, { id: 'resources', visible: true }, - { id: 'spot', visible: true }, + { id: 'status', visible: true }, { id: 'hub_user_name', visible: true }, - { id: 'price', visible: true }, { id: 'submitted_at', visible: true }, { id: 'finished_at', visible: true }, - { id: 'status', visible: true }, { id: 'error', visible: true }, + { id: 'price', visible: true }, { id: 'cost', visible: true }, + { id: 'spot', visible: true }, + { id: 'backend', visible: true }, + { id: 'region', visible: true }, // hidden by default { id: 'priority', visible: false }, { id: 'project', visible: false }, { id: 'repo', visible: false }, { id: 'instance', visible: false }, - { id: 'region', visible: false }, - { id: 'backend', visible: false }, ], wrapLines: false, stripedRows: false, diff --git a/frontend/src/pages/Runs/List/helpers.ts b/frontend/src/pages/Runs/List/helpers.ts index 4d9918bc7a..d6eed85d94 100644 --- a/frontend/src/pages/Runs/List/helpers.ts +++ b/frontend/src/pages/Runs/List/helpers.ts @@ -1,6 +1,8 @@ import { groupBy as _groupBy } from 'lodash'; import { getBaseUrl } from 'App/helpers'; +import { formatBackend } from 'libs/fleet'; +import { formatResources } from 'libs/resources'; import { finishedJobs, finishedRunStatuses } from '../constants'; import { getJobStatus } from '../Details/Jobs/List/helpers'; @@ -14,7 +16,8 @@ export const getRunListItemResources = (run: IRun) => { return '-'; } - return run.latest_job_submission?.job_provisioning_data?.instance_type?.resources?.description ?? 
'-'; + const resources = run.latest_job_submission?.job_provisioning_data?.instance_type?.resources; + return resources ? formatResources(resources) : '-'; }; export const getRunListItemSpotLabelKey = (run: IRun) => { @@ -84,7 +87,7 @@ export const getRunListItemBackend = (run: IRun) => { return '-'; } - return run.latest_job_submission?.job_provisioning_data?.backend ?? '-'; + return formatBackend(run.latest_job_submission?.job_provisioning_data?.backend); }; export const getRunListItemServiceUrl = (run: IRun) => { diff --git a/frontend/src/router.tsx b/frontend/src/router.tsx index 206eecd54e..1ee08eec9c 100644 --- a/frontend/src/router.tsx +++ b/frontend/src/router.tsx @@ -14,7 +14,10 @@ import { FleetAdd, FleetDetails, FleetList } from 'pages/Fleets'; import { EventsList as FleetEventsList } from 'pages/Fleets/Details/Events'; import { FleetDetails as FleetDetailsGeneral } from 'pages/Fleets/Details/FleetDetails'; import { FleetInspect } from 'pages/Fleets/Details/Inspect'; -import { InstanceList } from 'pages/Instances'; +import { InstanceDetailsPage, InstanceList } from 'pages/Instances'; +import { InstanceDetails } from 'pages/Instances/Details/InstanceDetails'; +import { EventsList as InstanceEventsList } from 'pages/Instances/Details/Events'; +import { InstanceInspect } from 'pages/Instances/Details/Inspect'; import { ModelsList } from 'pages/Models'; import { ModelDetails } from 'pages/Models/Details'; import { CreateProjectWizard, ProjectAdd, ProjectDetails, ProjectEvents, ProjectList, ProjectSettings } from 'pages/Project'; @@ -234,6 +237,24 @@ export const router = createBrowserRouter([ path: ROUTES.INSTANCES.LIST, element: , }, + { + path: ROUTES.INSTANCES.DETAILS.TEMPLATE, + element: , + children: [ + { + index: true, + element: , + }, + { + path: ROUTES.INSTANCES.DETAILS.EVENTS.TEMPLATE, + element: , + }, + { + path: ROUTES.INSTANCES.DETAILS.INSPECT.TEMPLATE, + element: , + }, + ], + }, // Volumes { diff --git a/frontend/src/routes.ts 
b/frontend/src/routes.ts index 45458d5359..e5d06ef942 100644 --- a/frontend/src/routes.ts +++ b/frontend/src/routes.ts @@ -165,6 +165,21 @@ export const ROUTES = { INSTANCES: { LIST: '/instances', + DETAILS: { + TEMPLATE: `/projects/:projectName/instances/:instanceId`, + FORMAT: (projectName: string, instanceId: string) => + buildRoute(ROUTES.INSTANCES.DETAILS.TEMPLATE, { projectName, instanceId }), + EVENTS: { + TEMPLATE: `/projects/:projectName/instances/:instanceId/events`, + FORMAT: (projectName: string, instanceId: string) => + buildRoute(ROUTES.INSTANCES.DETAILS.EVENTS.TEMPLATE, { projectName, instanceId }), + }, + INSPECT: { + TEMPLATE: `/projects/:projectName/instances/:instanceId/inspect`, + FORMAT: (projectName: string, instanceId: string) => + buildRoute(ROUTES.INSTANCES.DETAILS.INSPECT.TEMPLATE, { projectName, instanceId }), + }, + }, }, VOLUMES: { diff --git a/frontend/src/services/instance.ts b/frontend/src/services/instance.ts index e483084b1a..a6107e3d9b 100644 --- a/frontend/src/services/instance.ts +++ b/frontend/src/services/instance.ts @@ -25,6 +25,18 @@ export const instanceApi = createApi({ result ? [...result.map(({ name }) => ({ type: 'Instance' as const, id: name })), 'Instances'] : ['Instances'], }), + getInstanceDetails: builder.query({ + query: ({ projectName, instanceId }) => { + return { + url: API.INSTANCES.DETAILS(projectName), + method: 'POST', + body: { id: instanceId }, + }; + }, + + providesTags: (result) => (result ? 
[{ type: 'Instance' as const, id: result.name }] : []), + }), + deleteInstances: builder.mutation< void, { projectName: IProject['project_name']; fleetName: string; instancesNums: number[] } @@ -42,4 +54,4 @@ export const instanceApi = createApi({ }), }); -export const { useLazyGetInstancesQuery, useDeleteInstancesMutation } = instanceApi; +export const { useLazyGetInstancesQuery, useGetInstanceDetailsQuery, useDeleteInstancesMutation } = instanceApi; diff --git a/frontend/src/types/instance.d.ts b/frontend/src/types/instance.d.ts index 5a661f26da..585f4f5093 100644 --- a/frontend/src/types/instance.d.ts +++ b/frontend/src/types/instance.d.ts @@ -14,6 +14,8 @@ declare type TInstanceStatus = | 'terminating' | 'terminated'; +declare type THealthStatus = 'healthy' | 'warning' | 'failure'; + declare interface IInstance { id: string; fleet_name: string; @@ -30,7 +32,15 @@ declare interface IInstance { job_status: TJobStatus | null; hostname: string; status: TInstanceStatus; + unreachable: boolean; + health_status: THealthStatus; + termination_reason: string | null; + termination_reason_message: string | null; created: DateTime; + finished_at: DateTime | null; region: string; + availability_zone: string | null; price: number | null; + total_blocks: number | null; + busy_blocks: number; } diff --git a/frontend/src/types/run.d.ts b/frontend/src/types/run.d.ts index e624a63e3b..3eac746218 100644 --- a/frontend/src/types/run.d.ts +++ b/frontend/src/types/run.d.ts @@ -269,7 +269,9 @@ declare interface IResources { spot: boolean; disk?: IDisk; + cpu_arch?: string | null; + /** @deprecated Use formatResources() from libs/resources instead. Remove in 0.21. 
*/ description?: string; } diff --git a/src/dstack/_internal/core/models/instances.py b/src/dstack/_internal/core/models/instances.py index 012916f97e..7eccee8b69 100644 --- a/src/dstack/_internal/core/models/instances.py +++ b/src/dstack/_internal/core/models/instances.py @@ -1,10 +1,10 @@ import datetime from enum import Enum -from typing import Any, Dict, List, Optional +from typing import Annotated, Any, Dict, List, Optional from uuid import UUID import gpuhunt -from pydantic import root_validator +from pydantic import Field, root_validator from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.common import ( @@ -56,8 +56,11 @@ class Resources(CoreModel): spot: bool disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility cpu_arch: Optional[gpuhunt.CPUArchitecture] = None - # TODO: make description a computed field after migrating to pydanticV2 - description: str = "" + # Deprecated: description is now generated client-side. TODO: remove in 0.21. + description: Annotated[ + str, + Field(description="Deprecated: generated client-side. 
Will be removed in 0.21."), + ] = "" @root_validator def _description(cls, values) -> Dict: @@ -339,6 +342,7 @@ class Instance(CoreModel): termination_reason: Optional[str] = None termination_reason_message: Optional[str] = None created: datetime.datetime + finished_at: Optional[datetime.datetime] = None region: Optional[str] = None availability_zone: Optional[str] = None price: Optional[float] = None diff --git a/src/dstack/_internal/server/services/instances.py b/src/dstack/_internal/server/services/instances.py index f37e1c9682..046f092c03 100644 --- a/src/dstack/_internal/server/services/instances.py +++ b/src/dstack/_internal/server/services/instances.py @@ -228,6 +228,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance: ), termination_reason_message=instance_model.termination_reason_message, created=instance_model.created_at, + finished_at=instance_model.finished_at, total_blocks=instance_model.total_blocks, busy_blocks=instance_model.busy_blocks, ) diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index fef712acae..1a250612ba 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -411,6 +411,7 @@ async def test_creates_fleet(self, test_db, session: AsyncSession, client: Async "termination_reason": None, "termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", + "finished_at": None, "backend": None, "region": None, "availability_zone": None, @@ -554,6 +555,7 @@ async def test_creates_ssh_fleet(self, test_db, session: AsyncSession, client: A "termination_reason": None, "termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", + "finished_at": None, "region": "remote", "availability_zone": None, "price": 0.0, @@ -733,6 +735,7 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A "termination_reason": "terminated_by_user", 
"termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", + "finished_at": None, "region": "remote", "availability_zone": None, "price": 0.0, @@ -767,6 +770,7 @@ async def test_updates_ssh_fleet(self, test_db, session: AsyncSession, client: A "termination_reason": None, "termination_reason_message": None, "created": "2023-01-02T03:04:00+00:00", + "finished_at": None, "region": "remote", "availability_zone": None, "price": 0.0, From 733a17c7746dda39112f85cec1a7bf601778c656 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:22:07 +0100 Subject: [PATCH 171/187] Add Crusoe Cloud backend (#3602) * Add Crusoe Cloud backend Add a VM-based Crusoe Cloud backend supporting single-node and multi-node (cluster) provisioning with InfiniBand. Key features: - gpuhunt online provider for offers with project quota filtering - HMAC-SHA256 authenticated REST API client - Image selection based on GPU type (SXM/PCIe/ROCm/CPU) - Storage: persistent data disk for types without ephemeral NVMe; auto-detects and RAID-0s NVMe for types with ephemeral storage; moves containerd storage so containers get the full disk space - Cluster support via IB partitions - Two-phase termination with data disk cleanup Tested end-to-end: - L40S: fleet, dev env, GPU, configurable disk (200GB), clean termination - A100-PCIe: fleet, dev env, GPU, NVMe auto-mount (880GB), clean termination - A100-SXM-IB cluster: IB partition created, 1 node provisioned with IB and 8x NVMe RAID-0 (7TB); 2nd node failed on capacity (out_of_stock) - Offers: quota enforcement, disk sizes correct per instance type Not tested (no capacity/quota): - H100-SXM-IB, MI300X-IB, MI355X-RoCE (no hardware available) - CPU-only instances c1a/s1a (no quota) - Spot provisioning (disabled in gpuhunt, see TODO) - Full 2-node cluster with IB connectivity test TODOs: - Spot: disabled until Crusoe confirms how to request spot billing via the VM create API 
endpoint - gpuhunt dependency: currently installed from PR branch; switch to pinned version after gpuhunt PR #211 is merged and released AI Assistance: This implementation was developed with AI assistance. Co-authored-by: Cursor * Fetch Crusoe locations dynamically instead of hardcoding Co-authored-by: Cursor * Fix VM image selection for SXM instance types The _get_image function checked gpu_type (e.g. 'A100') for 'SXM', but gpuhunt normalizes GPU names and strips the SXM qualifier. Check the instance type name instead (e.g. 'a100-80gb-sxm-ib.8x') which preserves the '-sxm' indicator. Without this fix, SXM-IB instances used the PCIe docker image which lacks IB drivers, HPC-X, and NCCL topology files. Verified with a 2-node A100-SXM-IB NCCL all_reduce test: 193 GB/s bus bandwidth. Made-with: Cursor * Switch gpuhunt dependency from PR branch to main Made-with: Cursor * Add TODOs to pin gpuhunt and remove allow-direct-references before merging Made-with: Cursor * Pin gpuhunt==0.1.17 (matches master) Made-with: Cursor --------- Co-authored-by: Cursor --- docs/docs/concepts/backends.md | 28 ++ docs/docs/reference/server/config.yml.md | 17 + frontend/src/types/backend.d.ts | 1 + pyproject.toml | 5 +- .../_internal/core/backends/configurators.py | 9 + .../_internal/core/backends/crusoe/backend.py | 16 + .../_internal/core/backends/crusoe/compute.py | 436 ++++++++++++++++++ .../core/backends/crusoe/configurator.py | 78 ++++ .../_internal/core/backends/crusoe/models.py | 48 ++ .../core/backends/crusoe/resources.py | 198 ++++++++ src/dstack/_internal/core/backends/models.py | 8 + .../_internal/core/models/backends/base.py | 2 + .../_internal/server/routers/test_backends.py | 1 + .../server/services/test_backend_configs.py | 41 ++ 14 files changed, 887 insertions(+), 1 deletion(-) create mode 100644 src/dstack/_internal/core/backends/crusoe/backend.py create mode 100644 src/dstack/_internal/core/backends/crusoe/compute.py create mode 100644 
src/dstack/_internal/core/backends/crusoe/configurator.py create mode 100644 src/dstack/_internal/core/backends/crusoe/models.py create mode 100644 src/dstack/_internal/core/backends/crusoe/resources.py diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 0213b669d4..5446cef160 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -929,6 +929,34 @@ projects: * `sizes` - read * `ssh_key` - create, read, update,delete +### Crusoe Cloud + +Log into your [Crusoe Cloud](https://console.crusoecloud.com/) console and create an API key +under your account settings. Note your project ID from the project settings page. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: +- name: main + backends: + - type: crusoe + project_id: your-project-id + creds: + type: access_key + access_key: your-access-key + secret_key: your-secret-key + regions: + - us-east1-a + - us-southcentral1-a +``` + +
    + +`regions` is optional. If not specified, all available Crusoe regions are used. + ### Hot Aisle Log in to the SSH TUI as described in the [Hot Aisle Quick Start](https://hotaisle.xyz/quick-start/). diff --git a/docs/docs/reference/server/config.yml.md b/docs/docs/reference/server/config.yml.md index 26c01d73e2..80e48b028e 100644 --- a/docs/docs/reference/server/config.yml.md +++ b/docs/docs/reference/server/config.yml.md @@ -335,6 +335,23 @@ to configure [backends](../../concepts/backends.md) and other [server-level sett type: required: true +##### `projects[n].backends[type=crusoe]` { #crusoe data-toc-label="crusoe" } + +#SCHEMA# dstack._internal.core.backends.crusoe.models.CrusoeBackendConfigWithCreds + overrides: + show_root_heading: false + type: + required: true + item_id_prefix: crusoe- + +###### `projects[n].backends[type=crusoe].creds` { #crusoe-creds data-toc-label="creds" } + +#SCHEMA# dstack._internal.core.backends.crusoe.models.CrusoeAccessKeyCreds + overrides: + show_root_heading: false + type: + required: true + ##### `projects[n].backends[type=hotaisle]` { #hotaisle data-toc-label="hotaisle" } #SCHEMA# dstack._internal.core.backends.hotaisle.models.HotAisleBackendConfigWithCreds diff --git a/frontend/src/types/backend.d.ts b/frontend/src/types/backend.d.ts index b41d796cee..dfe74648d4 100644 --- a/frontend/src/types/backend.d.ts +++ b/frontend/src/types/backend.d.ts @@ -1,6 +1,7 @@ declare type TBackendType = | 'aws' | 'azure' + | 'crusoe' | 'cudo' | 'datacrunch' | 'dstack' diff --git a/pyproject.toml b/pyproject.toml index 6c5005b43c..3336fc5423 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -256,6 +256,9 @@ fluentbit = [ "elasticsearch>=8.0.0", "dstack[server]", ] +crusoe = [ + "dstack[server]", +] all = [ - "dstack[gateway,server,aws,azure,gcp,verda,kubernetes,lambda,nebius,oci,fluentbit]", + "dstack[gateway,server,aws,azure,gcp,verda,kubernetes,lambda,nebius,oci,crusoe,fluentbit]", ] diff --git 
a/src/dstack/_internal/core/backends/configurators.py b/src/dstack/_internal/core/backends/configurators.py index ec7f976c53..75a4a86abb 100644 --- a/src/dstack/_internal/core/backends/configurators.py +++ b/src/dstack/_internal/core/backends/configurators.py @@ -35,6 +35,15 @@ except ImportError: pass +try: + from dstack._internal.core.backends.crusoe.configurator import ( + CrusoeConfigurator, + ) + + _CONFIGURATOR_CLASSES.append(CrusoeConfigurator) +except ImportError: + pass + try: from dstack._internal.core.backends.cudo.configurator import ( CudoConfigurator, diff --git a/src/dstack/_internal/core/backends/crusoe/backend.py b/src/dstack/_internal/core/backends/crusoe/backend.py new file mode 100644 index 0000000000..9f81f136d1 --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/backend.py @@ -0,0 +1,16 @@ +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.crusoe.compute import CrusoeCompute +from dstack._internal.core.backends.crusoe.models import CrusoeConfig +from dstack._internal.core.models.backends.base import BackendType + + +class CrusoeBackend(Backend): + TYPE = BackendType.CRUSOE + COMPUTE_CLASS = CrusoeCompute + + def __init__(self, config: CrusoeConfig): + self.config = config + self._compute = CrusoeCompute(self.config) + + def compute(self) -> CrusoeCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/crusoe/compute.py b/src/dstack/_internal/core/backends/crusoe/compute.py new file mode 100644 index 0000000000..10ede96776 --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/compute.py @@ -0,0 +1,436 @@ +from collections.abc import Iterable +from typing import List, Optional + +import gpuhunt +from gpuhunt.providers.crusoe import CrusoeProvider + +from dstack._internal.core.backends.base.backend import Compute +from dstack._internal.core.backends.base.compute import ( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + 
ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + ComputeWithPrivilegedSupport, + generate_unique_instance_name, + get_shim_commands, +) +from dstack._internal.core.backends.base.offers import ( + OfferModifier, + get_catalog_offers, + get_offers_disk_modifier, +) +from dstack._internal.core.backends.crusoe.models import CrusoeConfig +from dstack._internal.core.backends.crusoe.resources import CrusoeClient +from dstack._internal.core.errors import BackendError, NotYetTerminated +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.common import CoreModel +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceConfiguration, + InstanceOffer, + InstanceOfferWithAvailability, +) +from dstack._internal.core.models.placement import ( + PlacementGroup, + PlacementGroupProvisioningData, + PlacementStrategy, +) +from dstack._internal.core.models.resources import Memory, Range +from dstack._internal.core.models.runs import JobProvisioningData, Requirements +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +# Range for the persistent data disk created for instance types without ephemeral NVMe. +CONFIGURABLE_DISK_SIZE = Range[Memory]( + min=Memory.parse("50GB"), + max=Memory.parse("5000GB"), +) +WAIT_FOR_DISK_TIMEOUT = 30 +WAIT_FOR_VM_TIMEOUT = 120 + +SETUP_COMMANDS = [ + 'sed -i "s/.*AllowTcpForwarding.*/AllowTcpForwarding yes/g" /etc/ssh/sshd_config', + "service ssh restart", +] + +# Set up storage on the best available disk and move containerd there. +# Docker on Crusoe images delegates image storage to containerd's native snapshotter, +# so /var/lib/containerd is what determines container disk space. +# Handles: /dev/vdb (persistent data disk we create) or /dev/nvme* (ephemeral NVMe). +# For multiple NVMe drives, uses mdadm RAID-0 for maximum space. 
+STORAGE_SETUP_COMMANDS = [ + ( + "DISK='' && " + "if [ -b /dev/vdb ]; then DISK=/dev/vdb; " + "elif ls /dev/nvme*n1 >/dev/null 2>&1; then" + " NVME_DEVS=$(ls /dev/nvme*n1 2>/dev/null);" + " NVME_COUNT=$(echo $NVME_DEVS | wc -w);" + " if [ $NVME_COUNT -eq 1 ]; then DISK=$NVME_DEVS;" + " elif [ $NVME_COUNT -gt 1 ]; then" + " apt-get install -y -qq mdadm >/dev/null 2>&1 || true;" + " mdadm --create /dev/md0 --level=0 --raid-devices=$NVME_COUNT $NVME_DEVS --force --run;" + " DISK=/dev/md0;" + " fi;" + "fi && " + 'if [ -n "$DISK" ]; then' + " mkfs.ext4 -q -F $DISK" + " && mkdir -p /data" + " && mount $DISK /data" + " && service docker stop" + " && systemctl stop containerd || true" + " && mkdir -p /data/containerd" + " && rsync -a /var/lib/containerd/ /data/containerd/" + " && mount --bind /data/containerd /var/lib/containerd" + " && systemctl start containerd || true" + " && service docker start" + "; fi" + ), +] + +IMAGE_SXM_DOCKER = "ubuntu22.04-nvidia-sxm-docker:latest" +IMAGE_PCIE_DOCKER = "ubuntu22.04-nvidia-pcie-docker:latest" +IMAGE_ROCM = "ubuntu-rocm:latest" +IMAGE_BASE = "ubuntu22.04:latest" + + +def _get_image(instance_name: str, gpu_type: str) -> str: + if not gpu_type: + return IMAGE_BASE + # Check instance name for SXM -- gpu_type from gpuhunt is normalized (e.g. "A100") + # and doesn't contain "SXM", but instance names like "a100-80gb-sxm-ib.8x" do. 
+ if "-sxm" in instance_name.lower(): + return IMAGE_SXM_DOCKER + if "MI3" in gpu_type: + return IMAGE_ROCM + return IMAGE_PCIE_DOCKER + + +def _is_ib_type(instance_name: str) -> bool: + prefix = instance_name.split(".")[0] + return prefix.endswith("-ib") or prefix.endswith("-roce") + + +def _get_instance_family(instance_name: str) -> str: + return instance_name.rsplit(".", 1)[0] + + +def _has_ephemeral_disk(offer: InstanceOffer) -> bool: + """Check if the instance type has ephemeral NVMe storage via gpuhunt provider_data.""" + backend_data = offer.backend_data or {} + return backend_data.get("disk_gb", 0) > 0 + + +class CrusoeCompute( + ComputeWithAllOffersCached, + ComputeWithCreateInstanceSupport, + ComputeWithPrivilegedSupport, + ComputeWithMultinodeSupport, + ComputeWithPlacementGroupSupport, + Compute, +): + def __init__(self, config: CrusoeConfig): + super().__init__() + self.config = config + self._client = CrusoeClient(config.creds, config.project_id) + self._catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False) + self._catalog.add_provider( + CrusoeProvider( + access_key=config.creds.access_key, + secret_key=config.creds.secret_key, + project_id=config.project_id, + ) + ) + + def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=BackendType.CRUSOE, + locations=self.config.regions or None, + catalog=self._catalog, + ) + quota_map = self._get_quota_map() + result = [] + for offer in offers: + family = _get_instance_family(offer.instance.name) + availability = InstanceAvailability.UNKNOWN + for prog_name, available in quota_map.items(): + if family.startswith(prog_name) or prog_name.startswith(family): + availability = ( + InstanceAvailability.AVAILABLE + if available > 0 + else InstanceAvailability.NO_QUOTA + ) + break + result.append( + InstanceOfferWithAvailability( + **offer.dict(), + availability=availability, + ) + ) + return result + + def _get_quota_map(self) -> 
dict[str, int]: + try: + quotas = self._client.list_quotas() + except Exception: + logger.warning("Failed to fetch Crusoe quotas, availability will be UNKNOWN") + return {} + result = {} + for q in quotas: + prog_name = q.get("programmatic_name", "") + available = q.get("available", 0) + category = q.get("category", "") + if "Instance" in category: + result[prog_name] = available + return result + + def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]: + # Only adjust disk size for types without ephemeral NVMe (disk_gb == 0). + # Types with ephemeral NVMe already have their disk_size set by gpuhunt. + base_modifier = get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements) + + def modifier( + offer: InstanceOfferWithAvailability, + ) -> Optional[InstanceOfferWithAvailability]: + if _has_ephemeral_disk(offer): + return offer + return base_modifier(offer) + + return [modifier] + + def create_instance( + self, + instance_offer: InstanceOfferWithAvailability, + instance_config: InstanceConfiguration, + placement_group: Optional[PlacementGroup], + ) -> JobProvisioningData: + instance_name = generate_unique_instance_name(instance_config) + region = instance_offer.region + + ib_partition_id = None + if placement_group: + assert placement_group.provisioning_data is not None + pg_data = CrusoePlacementGroupBackendData.load( + placement_group.provisioning_data.backend_data + ) + ib_partition_id = pg_data.ib_partition_id + + gpus = instance_offer.instance.resources.gpus + gpu_type = gpus[0].name if gpus else "" + instance_type_name = instance_offer.instance.name + image = _get_image(instance_type_name, gpu_type) + + needs_data_disk = not _has_ephemeral_disk(instance_offer) + # Always include storage setup: it auto-detects /dev/vdb (data disk) or + # /dev/nvme* (ephemeral NVMe) and moves containerd storage there. 
+ commands = SETUP_COMMANDS + STORAGE_SETUP_COMMANDS + get_shim_commands(is_privileged=True) + startup_script = "#!/bin/bash\nset -e\n" + " && ".join(commands) + + data_disk_id = None + create_op = None + try: + if needs_data_disk: + disk_size_mib = instance_offer.instance.resources.disk.size_mib + disk_size_gib = max(disk_size_mib // 1024, 1) + disk_op = self._client.create_disk( + name=f"{instance_name}-data", + size=f"{disk_size_gib}GiB", + location=region, + ) + data_disk_id = disk_op["metadata"]["id"] + self._client.wait_for_disk_operation( + disk_op["operation_id"], timeout=WAIT_FOR_DISK_TIMEOUT + ) + + disks = None + if data_disk_id: + disks = [ + {"disk_id": data_disk_id, "mode": "read-write", "attachment_type": "data"} + ] + + host_channel_adapters = None + if ib_partition_id: + host_channel_adapters = [{"ib_partition_id": ib_partition_id}] + + create_op = self._client.create_vm( + name=instance_name, + vm_type=instance_type_name, + location=region, + ssh_public_key=instance_config.get_public_keys()[0], + image=image, + startup_script=startup_script, + disks=disks, + host_channel_adapters=host_channel_adapters, + ) + vm_id = create_op["metadata"]["id"] + self._client.wait_for_vm_operation( + create_op["operation_id"], timeout=WAIT_FOR_VM_TIMEOUT + ) + except BaseException: + if create_op is not None: + vm_id_to_delete = create_op.get("metadata", {}).get("id") + if vm_id_to_delete: + try: + self._client.delete_vm(vm_id_to_delete) + except Exception as e: + logger.exception("Could not delete VM %s: %s", vm_id_to_delete, e) + if data_disk_id: + try: + self._client.delete_disk(data_disk_id) + except Exception as e: + logger.exception("Could not delete disk %s: %s", data_disk_id, e) + raise + + return JobProvisioningData( + backend=instance_offer.backend, + instance_type=instance_offer.instance, + instance_id=vm_id, + hostname=None, + region=region, + price=instance_offer.price, + ssh_port=22, + username="ubuntu", + dockerized=True, + 
backend_data=CrusoeInstanceBackendData(data_disk_id=data_disk_id).json(), + ) + + def update_provisioning_data( + self, provisioning_data, project_ssh_public_key, project_ssh_private_key + ): + try: + vm = self._client.get_vm(provisioning_data.instance_id) + except Exception: + return + interfaces = vm.get("network_interfaces", []) + if not interfaces: + return + ips = interfaces[0].get("ips", []) + if not ips: + return + public_ipv4 = ips[0].get("public_ipv4", {}) + private_ipv4 = ips[0].get("private_ipv4", {}) + if public_ipv4.get("address"): + provisioning_data.hostname = public_ipv4["address"] + if private_ipv4.get("address"): + provisioning_data.internal_ip = private_ipv4["address"] + + def terminate_instance( + self, instance_id: str, region: str, backend_data: Optional[str] = None + ): + backend_data_parsed = CrusoeInstanceBackendData.load(backend_data) + try: + vm = self._client.get_vm(instance_id) + except BackendError: + # VM not found (404) or other API error -- treat as already deleted + vm = None + + if vm is not None: + state = vm.get("state", "") + if state not in ("STATE_DELETING", "STATE_DELETED"): + try: + self._client.delete_vm(instance_id) + except BackendError: + pass + raise NotYetTerminated(f"Requested VM deletion. State was: {state}") + else: + raise NotYetTerminated(f"Waiting for VM deletion. State: {state}") + + # OS disk is auto-deleted with the VM. Data disk must be deleted separately. 
+ if backend_data_parsed.data_disk_id: + try: + self._client.delete_disk(backend_data_parsed.data_disk_id) + except BackendError: + pass + + def create_placement_group( + self, + placement_group: PlacementGroup, + master_instance_offer: InstanceOffer, + ) -> PlacementGroupProvisioningData: + assert placement_group.configuration.placement_strategy == PlacementStrategy.CLUSTER + instance_name = master_instance_offer.instance.name + region = placement_group.configuration.region + + if not _is_ib_type(instance_name): + return PlacementGroupProvisioningData( + backend=BackendType.CRUSOE, + backend_data=CrusoePlacementGroupBackendData( + ib_partition_id=None, ib_network_id=None + ).json(), + ) + + ib_networks = self._client.list_ib_networks() + target_network = None + for net in ib_networks: + if net.get("location") != region: + continue + for cap in net.get("capacities", []): + if cap.get("slice_type") == instance_name: + target_network = net + break + if target_network: + break + + if target_network is None: + raise BackendError( + f"No IB network found in {region} for instance type {instance_name}" + ) + + partition = self._client.create_ib_partition( + name=placement_group.name, + ib_network_id=target_network["id"], + ) + return PlacementGroupProvisioningData( + backend=BackendType.CRUSOE, + backend_data=CrusoePlacementGroupBackendData( + ib_partition_id=partition["id"], + ib_network_id=target_network["id"], + ).json(), + ) + + def delete_placement_group(self, placement_group: PlacementGroup) -> None: + assert placement_group.provisioning_data is not None + pg_data = CrusoePlacementGroupBackendData.load( + placement_group.provisioning_data.backend_data + ) + if pg_data.ib_partition_id: + try: + self._client.delete_ib_partition(pg_data.ib_partition_id) + except BackendError: + pass + + def is_suitable_placement_group( + self, + placement_group: PlacementGroup, + instance_offer: InstanceOffer, + ) -> bool: + if placement_group.configuration.region != 
instance_offer.region: + return False + assert placement_group.provisioning_data is not None + pg_data = CrusoePlacementGroupBackendData.load( + placement_group.provisioning_data.backend_data + ) + if pg_data.ib_partition_id is None: + return not _is_ib_type(instance_offer.instance.name) + return _is_ib_type(instance_offer.instance.name) + + +class CrusoeInstanceBackendData(CoreModel): + data_disk_id: Optional[str] = None + + @classmethod + def load(cls, raw: Optional[str]) -> "CrusoeInstanceBackendData": + if raw is None: + return cls() + return cls.__response__.parse_raw(raw) + + +class CrusoePlacementGroupBackendData(CoreModel): + ib_partition_id: Optional[str] = None + ib_network_id: Optional[str] = None + + @classmethod + def load(cls, raw: Optional[str]) -> "CrusoePlacementGroupBackendData": + if raw is None: + return cls() + return cls.__response__.parse_raw(raw) diff --git a/src/dstack/_internal/core/backends/crusoe/configurator.py b/src/dstack/_internal/core/backends/crusoe/configurator.py new file mode 100644 index 0000000000..95f805458e --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/configurator.py @@ -0,0 +1,78 @@ +import json + +from dstack._internal.core.backends.base.configurator import ( + BackendRecord, + Configurator, + raise_invalid_credentials_error, +) +from dstack._internal.core.backends.crusoe.backend import CrusoeBackend +from dstack._internal.core.backends.crusoe.models import ( + CrusoeBackendConfig, + CrusoeBackendConfigWithCreds, + CrusoeConfig, + CrusoeCreds, + CrusoeStoredConfig, +) +from dstack._internal.core.backends.crusoe.resources import CrusoeClient +from dstack._internal.core.models.backends.base import BackendType + + +class CrusoeConfigurator( + Configurator[ + CrusoeBackendConfig, + CrusoeBackendConfigWithCreds, + ] +): + TYPE = BackendType.CRUSOE + BACKEND_CLASS = CrusoeBackend + + def validate_config(self, config: CrusoeBackendConfigWithCreds, default_creds_enabled: bool): + try: + client = 
CrusoeClient(config.creds, config.project_id) + client.list_quotas() + except Exception as e: + raise_invalid_credentials_error( + fields=[["creds"]], + details=str(e), + ) + if config.regions: + try: + available = set(client.list_locations()) + except Exception: + return + invalid = set(config.regions) - available + if invalid: + raise_invalid_credentials_error( + fields=[["regions"]], + details=( + f"Unknown regions: {sorted(invalid)}. Valid regions: {sorted(available)}" + ), + ) + + def create_backend( + self, project_name: str, config: CrusoeBackendConfigWithCreds + ) -> BackendRecord: + return BackendRecord( + config=CrusoeStoredConfig( + **CrusoeBackendConfig.__response__.parse_obj(config).dict() + ).json(), + auth=CrusoeCreds.parse_obj(config.creds).json(), + ) + + def get_backend_config_with_creds(self, record: BackendRecord) -> CrusoeBackendConfigWithCreds: + config = self._get_config(record) + return CrusoeBackendConfigWithCreds.__response__.parse_obj(config) + + def get_backend_config_without_creds(self, record: BackendRecord) -> CrusoeBackendConfig: + config = self._get_config(record) + return CrusoeBackendConfig.__response__.parse_obj(config) + + def get_backend(self, record: BackendRecord) -> CrusoeBackend: + config = self._get_config(record) + return CrusoeBackend(config=config) + + def _get_config(self, record: BackendRecord) -> CrusoeConfig: + return CrusoeConfig.__response__( + **json.loads(record.config), + creds=CrusoeCreds.parse_raw(record.auth), + ) diff --git a/src/dstack/_internal/core/backends/crusoe/models.py b/src/dstack/_internal/core/backends/crusoe/models.py new file mode 100644 index 0000000000..f405eca1b7 --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/models.py @@ -0,0 +1,48 @@ +from typing import Annotated, List, Literal, Optional, Union + +from pydantic import Field + +from dstack._internal.core.models.common import CoreModel + + +class CrusoeAccessKeyCreds(CoreModel): + type: Annotated[Literal["access_key"], 
Field(description="The type of credentials")] = ( + "access_key" + ) + access_key: Annotated[str, Field(description="The Crusoe API access key")] + secret_key: Annotated[str, Field(description="The Crusoe API secret key")] + + +AnyCrusoeCreds = CrusoeAccessKeyCreds +CrusoeCreds = AnyCrusoeCreds + + +class CrusoeBackendConfig(CoreModel): + type: Annotated[ + Literal["crusoe"], + Field(description="The type of backend"), + ] = "crusoe" + project_id: Annotated[str, Field(description="The Crusoe Cloud project ID")] + regions: Annotated[ + Optional[List[str]], + Field(description="The list of allowed Crusoe regions. Omit to use all regions"), + ] = None + + +class CrusoeBackendConfigWithCreds(CrusoeBackendConfig): + creds: Annotated[AnyCrusoeCreds, Field(description="The credentials")] + + +AnyCrusoeBackendConfig = Union[CrusoeBackendConfig, CrusoeBackendConfigWithCreds] + + +class CrusoeBackendFileConfigWithCreds(CrusoeBackendConfig): + creds: Annotated[AnyCrusoeCreds, Field(description="The credentials")] + + +class CrusoeStoredConfig(CrusoeBackendConfig): + pass + + +class CrusoeConfig(CrusoeStoredConfig): + creds: AnyCrusoeCreds diff --git a/src/dstack/_internal/core/backends/crusoe/resources.py b/src/dstack/_internal/core/backends/crusoe/resources.py new file mode 100644 index 0000000000..1f84ff4019 --- /dev/null +++ b/src/dstack/_internal/core/backends/crusoe/resources.py @@ -0,0 +1,198 @@ +import base64 +import datetime +import hashlib +import hmac +import time +from typing import Any, Dict, List, Optional + +import requests + +from dstack._internal.core.backends.crusoe.models import CrusoeAccessKeyCreds +from dstack._internal.core.errors import BackendError, NoCapacityError, ProvisioningError +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + +API_URL = "https://api.crusoecloud.com" +API_VERSION = "/v1alpha5" +SIGNATURE_VERSION = "1.0" +REQUEST_TIMEOUT = 30 + + +class CrusoeClient: + def __init__(self, creds: 
CrusoeAccessKeyCreds, project_id: str): + self.access_key = creds.access_key + self.secret_key = creds.secret_key + self.project_id = project_id + + def _request( + self, + method: str, + path: str, + params: Optional[dict] = None, + body: Optional[dict] = None, + ) -> requests.Response: + dt = str(datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)) + dt = dt.replace(" ", "T") + + query_string = "" + if params: + query_string = "&".join(f"{k}={v}" for k, v in sorted(params.items())) + + payload = f"{API_VERSION}{path}\n{query_string}\n{method}\n{dt}\n" + + decoded_secret = base64.urlsafe_b64decode( + self.secret_key + "=" * (-len(self.secret_key) % 4) + ) + sig = hmac.new(decoded_secret, msg=payload.encode("ascii"), digestmod=hashlib.sha256) + encoded_sig = base64.urlsafe_b64encode(sig.digest()).decode("ascii").rstrip("=") + + headers = { + "X-Crusoe-Timestamp": dt, + "Authorization": f"Bearer {SIGNATURE_VERSION}:{self.access_key}:{encoded_sig}", + } + if body is not None: + headers["Content-Type"] = "application/json" + + url = f"{API_URL}{API_VERSION}{path}" + resp = requests.request( + method, url, headers=headers, params=params, json=body, timeout=REQUEST_TIMEOUT + ) + if resp.status_code >= 400: + _raise_api_error(resp) + return resp + + def _project_path(self, path: str) -> str: + return f"/projects/{self.project_id}{path}" + + # --- VM operations --- + + def create_vm( + self, + name: str, + vm_type: str, + location: str, + ssh_public_key: str, + image: str, + startup_script: str, + disks: Optional[List[Dict[str, str]]] = None, + host_channel_adapters: Optional[List[Dict[str, str]]] = None, + ) -> dict: + body: Dict[str, Any] = { + "name": name, + "type": vm_type, + "location": location, + "ssh_public_key": ssh_public_key, + "image": image, + "startup_script": startup_script, + } + if disks: + body["disks"] = disks + if host_channel_adapters: + body["host_channel_adapters"] = host_channel_adapters + resp = self._request("POST", 
self._project_path("/compute/vms/instances"), body=body) + return resp.json()["operation"] + + def get_vm(self, vm_id: str) -> dict: + resp = self._request("GET", self._project_path(f"/compute/vms/instances/{vm_id}")) + return resp.json() + + def delete_vm(self, vm_id: str) -> dict: + resp = self._request("DELETE", self._project_path(f"/compute/vms/instances/{vm_id}")) + return resp.json()["operation"] + + def get_vm_operation(self, operation_id: str) -> dict: + resp = self._request( + "GET", self._project_path(f"/compute/vms/instances/operations/{operation_id}") + ) + return resp.json() + + # --- Disk operations --- + + def create_disk(self, name: str, size: str, location: str) -> dict: + body = { + "name": name, + "size": size, + "location": location, + "type": "persistent-ssd", + "block_size": 4096, + } + resp = self._request("POST", self._project_path("/storage/disks"), body=body) + return resp.json()["operation"] + + def delete_disk(self, disk_id: str) -> dict: + resp = self._request("DELETE", self._project_path(f"/storage/disks/{disk_id}")) + return resp.json()["operation"] + + def get_disk_operation(self, operation_id: str) -> dict: + resp = self._request( + "GET", self._project_path(f"/storage/disks/operations/{operation_id}") + ) + return resp.json() + + # --- Quota operations --- + + def list_quotas(self) -> List[dict]: + resp = self._request("GET", self._project_path("/quotas")) + return resp.json().get("quotas", []) + + # --- Location operations --- + + def list_locations(self) -> List[str]: + resp = self._request("GET", "/locations") + return resp.json().get("items", []) + + # --- IB operations --- + + def list_ib_networks(self) -> List[dict]: + resp = self._request("GET", self._project_path("/networking/ib-networks")) + return resp.json().get("items", []) + + def create_ib_partition(self, name: str, ib_network_id: str) -> dict: + body = {"name": name, "ib_network_id": ib_network_id} + resp = self._request("POST", 
self._project_path("/networking/ib-partitions"), body=body) + return resp.json() + + def delete_ib_partition(self, partition_id: str) -> None: + self._request("DELETE", self._project_path(f"/networking/ib-partitions/{partition_id}")) + + # --- Operation polling --- + + def wait_for_vm_operation( + self, operation_id: str, timeout: float = 120, interval: float = 5 + ) -> dict: + return self._wait_for_operation(operation_id, self.get_vm_operation, timeout, interval) + + def wait_for_disk_operation( + self, operation_id: str, timeout: float = 30, interval: float = 2 + ) -> dict: + return self._wait_for_operation(operation_id, self.get_disk_operation, timeout, interval) + + def _wait_for_operation(self, operation_id, get_fn, timeout, interval) -> dict: + deadline = time.monotonic() + timeout + while True: + op = get_fn(operation_id) + state = op.get("state", op.get("operation", {}).get("state")) + if state == "SUCCEEDED": + return op + if state == "FAILED": + result = op.get("result", {}) + code = result.get("code", "") + message = result.get("message", str(result)) + if code == "out_of_stock": + raise NoCapacityError(message) + raise ProvisioningError(f"Operation {operation_id} failed: {message}") + if time.monotonic() + interval > deadline: + raise BackendError(f"Operation {operation_id} timed out (state: {state})") + time.sleep(interval) + + +def _raise_api_error(resp: requests.Response) -> None: + try: + data = resp.json() + message = data.get("message", data.get("error", str(data))) + except Exception: + message = resp.text[:500] + if resp.status_code == 404: + raise BackendError(f"Resource not found: {message}") + raise BackendError(f"Crusoe API error ({resp.status_code}): {message}") diff --git a/src/dstack/_internal/core/backends/models.py b/src/dstack/_internal/core/backends/models.py index f1c59e2f44..36a7856e38 100644 --- a/src/dstack/_internal/core/backends/models.py +++ b/src/dstack/_internal/core/backends/models.py @@ -12,6 +12,11 @@ 
CloudRiftBackendConfig, CloudRiftBackendConfigWithCreds, ) +from dstack._internal.core.backends.crusoe.models import ( + CrusoeBackendConfig, + CrusoeBackendConfigWithCreds, + CrusoeBackendFileConfigWithCreds, +) from dstack._internal.core.backends.cudo.models import ( CudoBackendConfig, CudoBackendConfigWithCreds, @@ -79,6 +84,7 @@ AWSBackendConfig, AzureBackendConfig, CloudRiftBackendConfig, + CrusoeBackendConfig, CudoBackendConfig, BaseDigitalOceanBackendConfig, GCPBackendConfig, @@ -103,6 +109,7 @@ AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, CloudRiftBackendConfigWithCreds, + CrusoeBackendConfigWithCreds, CudoBackendConfigWithCreds, VerdaBackendConfigWithCreds, BaseDigitalOceanBackendConfigWithCreds, @@ -126,6 +133,7 @@ AWSBackendConfigWithCreds, AzureBackendConfigWithCreds, CloudRiftBackendConfigWithCreds, + CrusoeBackendFileConfigWithCreds, CudoBackendConfigWithCreds, VerdaBackendConfigWithCreds, BaseDigitalOceanBackendConfigWithCreds, diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 82efe09efa..47552010d4 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -8,6 +8,7 @@ class BackendType(str, enum.Enum): AWS (BackendType): Amazon Web Services AZURE (BackendType): Microsoft Azure CLOUDRIFT (BackendType): CloudRift + CRUSOE (BackendType): Crusoe Cloud CUDO (BackendType): Cudo DATACRUNCH (BackendType): DataCrunch (for backward compatibility) DIGITALOCEAN (BackendType): DigitalOcean @@ -29,6 +30,7 @@ class BackendType(str, enum.Enum): AWS = "aws" AZURE = "azure" CLOUDRIFT = "cloudrift" + CRUSOE = "crusoe" CUDO = "cudo" DATACRUNCH = "datacrunch" # BackendType for backward compatibility DIGITALOCEAN = "digitalocean" diff --git a/src/tests/_internal/server/routers/test_backends.py b/src/tests/_internal/server/routers/test_backends.py index 433c12de30..66c7f4ea36 100644 --- 
a/src/tests/_internal/server/routers/test_backends.py +++ b/src/tests/_internal/server/routers/test_backends.py @@ -87,6 +87,7 @@ async def test_returns_backend_types(self, client: AsyncClient): "aws", "azure", "cloudrift", + "crusoe", "cudo", *(["datacrunch"] if sys.version_info >= (3, 10) else []), "digitalocean", diff --git a/src/tests/_internal/server/services/test_backend_configs.py b/src/tests/_internal/server/services/test_backend_configs.py index 9ac2e3bbd8..455b38c6e4 100644 --- a/src/tests/_internal/server/services/test_backend_configs.py +++ b/src/tests/_internal/server/services/test_backend_configs.py @@ -13,6 +13,47 @@ ) +class TestCrusoeBackendConfig: + def test_config_parsing(self, tmp_path: Path): + config_yaml_path = tmp_path / "config.yml" + config_dict = { + "projects": [ + { + "name": "main", + "backends": [ + { + "type": "crusoe", + "project_id": "test-project-id", + "regions": ["us-east1-a"], + "creds": { + "type": "access_key", + "access_key": "test-access-key", + "secret_key": "test-secret-key", + }, + } + ], + } + ] + } + config_yaml_path.write_text(yaml.dump(config_dict)) + + with patch.object(settings, "SERVER_CONFIG_FILE_PATH", config_yaml_path): + m = ServerConfigManager() + assert m.load_config() + assert m.config is not None + assert m.config.projects is not None + assert len(m.config.projects) > 0 + assert m.config.projects[0].backends is not None + backend_file_cfg = m.config.projects[0].backends[0] + backend_cfg = file_config_to_config(backend_file_cfg) + + assert backend_cfg.type == "crusoe" + assert backend_cfg.project_id == "test-project-id" + assert backend_cfg.regions == ["us-east1-a"] + assert backend_cfg.creds.access_key == "test-access-key" + assert backend_cfg.creds.secret_key == "test-secret-key" + + @pytest.mark.skipif(sys.version_info < (3, 10), reason="Nebius requires Python 3.10") class TestNebiusBackendConfig: def test_with_filename(self, tmp_path: Path): From 976bf48a78f566e960adbb378eacf5ce3266064b Mon Sep 17 
00:00:00 2001 From: Oleg Date: Fri, 27 Feb 2026 13:20:52 +0300 Subject: [PATCH 172/187] [UI]: Refresh button does not refresh run logs, metrics, or events #3476 (#3618) --- frontend/package-lock.json | 20 +++++++++++++++++++ frontend/package.json | 1 + frontend/src/index.tsx | 5 ++++- .../pages/Runs/Details/Events/List/index.tsx | 7 ++++++- .../src/pages/Runs/Details/Inspect/index.tsx | 11 +++++++++- .../pages/Runs/Details/Jobs/Metrics/index.tsx | 13 +++++++++++- .../Details/Jobs/Metrics/useMetricsData.ts | 8 ++++++-- .../src/pages/Runs/Details/Logs/index.tsx | 6 ++++++ .../pages/Runs/Details/RunDetails/index.tsx | 11 ++++++++-- frontend/src/pages/Runs/Details/constants.ts | 2 ++ frontend/src/pages/Runs/Details/index.tsx | 11 +++++++--- 11 files changed, 84 insertions(+), 11 deletions(-) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 65e33fd915..e76e6e8b13 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -30,6 +30,7 @@ "rc-tooltip": "^5.2.2", "react": "^18.3.1", "react-avatar": "^5.0.3", + "react-bus": "^4.0.1", "react-dom": "^18.3.1", "react-helmet": "^6.1.0", "react-hook-form": "^7.53.0", @@ -13904,6 +13905,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/mitt": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz", + "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==", + "license": "MIT" + }, "node_modules/mnth": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/mnth/-/mnth-2.0.0.tgz", @@ -21492,6 +21499,19 @@ "react": "^15.0.0 || ^16.0.0 || ^17.0.0 || ^18.0.0" } }, + "node_modules/react-bus": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/react-bus/-/react-bus-4.0.1.tgz", + "integrity": "sha512-tzPWE23WN0U9v3YaGKAlLW7GXYv1YkCUgWcnzm8HDtfBeD2vDvK8PYHkJVrwMdzg5BleHcxejtdKYfIYZxD7PQ==", + "license": "MIT", + "dependencies": { + "@types/react": "^18.0.8", + "mitt": 
"^3.0.1" + }, + "peerDependencies": { + "react": ">=17.0.0 || ^19.0.0-0" + } + }, "node_modules/react-dev-utils": { "version": "12.0.1", "resolved": "https://registry.npmjs.org/react-dev-utils/-/react-dev-utils-12.0.1.tgz", diff --git a/frontend/package.json b/frontend/package.json index ae701fc95f..f4dfc7c09e 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -119,6 +119,7 @@ "rc-tooltip": "^5.2.2", "react": "^18.3.1", "react-avatar": "^5.0.3", + "react-bus": "^4.0.1", "react-dom": "^18.3.1", "react-helmet": "^6.1.0", "react-hook-form": "^7.53.0", diff --git a/frontend/src/index.tsx b/frontend/src/index.tsx index 8820bd30d2..c02da1084c 100644 --- a/frontend/src/index.tsx +++ b/frontend/src/index.tsx @@ -1,4 +1,5 @@ import React from 'react'; +import { Provider as BusProvider } from 'react-bus'; import { createRoot } from 'react-dom/client'; import { Provider } from 'react-redux'; import { RouterProvider } from 'react-router-dom'; @@ -36,7 +37,9 @@ if (container) { root.render( - + + + , ); diff --git a/frontend/src/pages/Runs/Details/Events/List/index.tsx b/frontend/src/pages/Runs/Details/Events/List/index.tsx index 79ccb54436..fb6610033e 100644 --- a/frontend/src/pages/Runs/Details/Events/List/index.tsx +++ b/frontend/src/pages/Runs/Details/Events/List/index.tsx @@ -1,4 +1,5 @@ import React from 'react'; +import { useListener } from 'react-bus'; import { useTranslation } from 'react-i18next'; import { useNavigate, useParams } from 'react-router-dom'; import Button from '@cloudscape-design/components/button'; @@ -12,13 +13,15 @@ import { useLazyGetAllEventsQuery } from 'services/events'; import { useColumnsDefinitions } from 'pages/Events/List/hooks/useColumnDefinitions'; +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../../constants'; + export const EventsList = () => { const { t } = useTranslation(); const params = useParams(); const paramRunId = params.runId ?? 
''; const navigate = useNavigate(); - const { data, isLoading, isLoadingMore } = useInfiniteScroll({ + const { data, isLoading, isLoadingMore, refreshList } = useInfiniteScroll({ useLazyQuery: useLazyGetAllEventsQuery, args: { limit: DEFAULT_TABLE_PAGE_SIZE, within_runs: [paramRunId] }, @@ -28,6 +31,8 @@ export const EventsList = () => { }), }); + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refreshList); + const { items, collectionProps } = useCollection(data, { selection: {}, }); diff --git a/frontend/src/pages/Runs/Details/Inspect/index.tsx b/frontend/src/pages/Runs/Details/Inspect/index.tsx index 5dc9e9a46b..68c4b7bcbb 100644 --- a/frontend/src/pages/Runs/Details/Inspect/index.tsx +++ b/frontend/src/pages/Runs/Details/Inspect/index.tsx @@ -1,4 +1,5 @@ import React, { useEffect, useMemo } from 'react'; +import { useListener } from 'react-bus'; import { useTranslation } from 'react-i18next'; import { useParams } from 'react-router-dom'; @@ -6,6 +7,8 @@ import { CodeEditor, Container, Header, Loader } from 'components'; import { useGetRunQuery } from 'services/run'; +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../constants'; + interface AceEditorElement extends HTMLElement { env?: { editor?: { @@ -20,11 +23,17 @@ export const RunInspect = () => { const paramProjectName = params.projectName ?? ''; const paramRunId = params.runId ?? 
''; - const { data: runData, isLoading } = useGetRunQuery({ + const { + data: runData, + isLoading, + refetch, + } = useGetRunQuery({ project_name: paramProjectName, id: paramRunId, }); + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refetch); + const jsonContent = useMemo(() => { if (!runData) return ''; return JSON.stringify(runData, null, 2); diff --git a/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx b/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx index dd088c418b..db89033984 100644 --- a/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx +++ b/frontend/src/pages/Runs/Details/Jobs/Metrics/index.tsx @@ -1,4 +1,5 @@ import React, { useEffect, useMemo } from 'react'; +import { useListener } from 'react-bus'; import { useTranslation } from 'react-i18next'; import { useParams } from 'react-router-dom'; @@ -7,6 +8,7 @@ import { Box, ColumnLayout, Container, Header, LineChart } from 'components'; import { riseRouterException } from 'libs'; import { useGetRunQuery } from 'services/run'; +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../../constants'; import { bytesFormatter, formatPercent, formatTime } from './helpers'; import { useMetricsData } from './useMetricsData'; @@ -32,7 +34,14 @@ export const JobMetrics: React.FC = () => { return runData.jobs.find((job) => job.job_spec.job_name === paramJobName) ?? null; }, [runData]); - const { cpuChartProps, memoryChartProps, eachGPUChartProps, eachGPUMemoryChartProps, isLoading } = useMetricsData({ + const { + cpuChartProps, + memoryChartProps, + eachGPUChartProps, + eachGPUMemoryChartProps, + isLoading, + refetch: refetchMetrics, + } = useMetricsData({ project_name: paramProjectName, run_name: runData?.run_spec.run_name ?? '', run_id: runData?.id ?? '', @@ -40,6 +49,8 @@ export const JobMetrics: React.FC = () => { limit: 1000, }); + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refetchMetrics); + const statusType = isLoading || isLoadingRun ? 
'loading' : 'finished'; useEffect(() => { diff --git a/frontend/src/pages/Runs/Details/Jobs/Metrics/useMetricsData.ts b/frontend/src/pages/Runs/Details/Jobs/Metrics/useMetricsData.ts index 758e3faa33..70bc1d6e5a 100644 --- a/frontend/src/pages/Runs/Details/Jobs/Metrics/useMetricsData.ts +++ b/frontend/src/pages/Runs/Details/Jobs/Metrics/useMetricsData.ts @@ -15,7 +15,11 @@ import { import { bytesFormatter, getChartProps } from './helpers'; export const useMetricsData = (params: TJobMetricsRequestParams) => { - const { data: metricsData, isLoading } = useGetMetricsQuery(params, { + const { + data: metricsData, + isLoading, + refetch, + } = useGetMetricsQuery(params, { skip: !params.run_name, }); @@ -76,5 +80,5 @@ export const useMetricsData = (params: TJobMetricsRequestParams) => { }); }, [metricsData]); - return { cpuChartProps, eachGPUChartProps, memoryChartProps, eachGPUMemoryChartProps, isLoading }; + return { cpuChartProps, eachGPUChartProps, memoryChartProps, eachGPUMemoryChartProps, isLoading, refetch }; }; diff --git a/frontend/src/pages/Runs/Details/Logs/index.tsx b/frontend/src/pages/Runs/Details/Logs/index.tsx index 2c836bb08d..4d12520bba 100644 --- a/frontend/src/pages/Runs/Details/Logs/index.tsx +++ b/frontend/src/pages/Runs/Details/Logs/index.tsx @@ -1,4 +1,5 @@ import React, { useCallback, useEffect, useLayoutEffect, useMemo, useRef, useState } from 'react'; +import { useListener } from 'react-bus'; import { useTranslation } from 'react-i18next'; import { useParams } from 'react-router-dom'; import classNames from 'classnames'; @@ -10,6 +11,7 @@ import { useLazyGetProjectLogsQuery } from 'services/project'; import { useGetRunQuery } from 'services/run'; import { LogRow } from './components/LogRow'; +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../constants'; import { decodeLogs, getJobSubmissionId } from './helpers'; import { IProps } from './types'; @@ -116,6 +118,10 @@ export const Logs: React.FC = ({ className, projectName, runName, jobSub 
getLogItems(); }, []); + const refreshLogs = useCallback(() => getLogItems(), []); + + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refreshLogs); + useLayoutEffect(() => { if (logsForView.length && logsForView.length <= LIMIT_LOG_ROWS) { scrollToBottom(); diff --git a/frontend/src/pages/Runs/Details/RunDetails/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/index.tsx index 408d4cb16b..9a34b6af74 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/index.tsx @@ -1,4 +1,5 @@ import React from 'react'; +import { useListener } from 'react-bus'; import { useTranslation } from 'react-i18next'; import { useParams } from 'react-router-dom'; import { format } from 'date-fns'; @@ -28,9 +29,9 @@ import { getRunListItemRegion, getRunListItemResources, getRunListItemSchedule, - getRunListItemServiceUrl, getRunListItemSpotLabelKey, } from '../../List/helpers'; +import { RUN_DETAILS_REFRESH_LIST_EVENT } from '../constants'; import { EventsList } from '../Events/List'; import { JobList } from '../Jobs/List'; import { ConnectToRunWithDevEnvConfiguration } from './ConnectToRunWithDevEnvConfiguration'; @@ -42,11 +43,17 @@ export const RunDetails = () => { const paramProjectName = params.projectName ?? ''; const paramRunId = params.runId ?? ''; - const { data: runData, isLoading: isLoadingRun } = useGetRunQuery({ + const { + data: runData, + isLoading: isLoadingRun, + refetch, + } = useGetRunQuery({ project_name: paramProjectName, id: paramRunId, }); + useListener(RUN_DETAILS_REFRESH_LIST_EVENT, refetch); + const schedule = runData ? getRunListItemSchedule(runData) : null; const nextTriggeredAt = runData ? 
runData.next_triggered_at : null; diff --git a/frontend/src/pages/Runs/Details/constants.ts b/frontend/src/pages/Runs/Details/constants.ts index 7a63d3f95c..be8b1da878 100644 --- a/frontend/src/pages/Runs/Details/constants.ts +++ b/frontend/src/pages/Runs/Details/constants.ts @@ -5,3 +5,5 @@ export enum CodeTab { Events = 'events', Inspect = 'inspect', } + +export const RUN_DETAILS_REFRESH_LIST_EVENT = 'RUN_DETAILS_REFRESH_LIST_EVENT'; diff --git a/frontend/src/pages/Runs/Details/index.tsx b/frontend/src/pages/Runs/Details/index.tsx index 5195b4fdc0..9a1912a0bf 100644 --- a/frontend/src/pages/Runs/Details/index.tsx +++ b/frontend/src/pages/Runs/Details/index.tsx @@ -1,4 +1,5 @@ import React, { useEffect } from 'react'; +import { useBus } from 'react-bus'; import { useTranslation } from 'react-i18next'; import { Outlet, /*useNavigate,*/ useParams } from 'react-router-dom'; import Button from '@cloudscape-design/components/button'; @@ -15,7 +16,7 @@ import { isAvailableStoppingForRun, // isAvailableDeletingForRun, } from '../utils'; -import { CodeTab } from './constants'; +import { CodeTab, RUN_DETAILS_REFRESH_LIST_EVENT } from './constants'; import styles from './styles.module.scss'; @@ -26,12 +27,12 @@ export const RunDetailsPage: React.FC = () => { const paramProjectName = params.projectName ?? ''; const paramRunId = params.runId ?? 
''; const [pushNotification] = useNotifications(); + const bus = useBus(); const { data: runData, error: runError, isLoading, - refetch, } = useGetRunQuery( { project_name: paramProjectName, @@ -108,6 +109,10 @@ export const RunDetailsPage: React.FC = () => { }); }; + const refreshHandle = () => { + bus.emit(RUN_DETAILS_REFRESH_LIST_EVENT); + }; + // const deleteClickHandle = () => { // if (!runData) { // return; @@ -157,7 +162,7 @@ export const RunDetailsPage: React.FC = () => { iconName="refresh" disabled={isLoading} ariaLabel={t('common.refresh')} - onClick={refetch} + onClick={refreshHandle} /> } From 1ee8cd0c11a6e0b4f97773d7339a830a610c5a4e Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Fri, 27 Feb 2026 12:15:52 +0100 Subject: [PATCH 173/187] Update Crusoe cluster docs (#3620) * Changed "Crusoe Cloud" to "Crusoe" in the backend documentation for consistency. * Update docs: feature Crusoe in cluster placement, rewrite cluster example - Add Crusoe tab to fleets.md cluster placement backends - Reorder backends.md to group Crusoe with other major cloud providers - Rewrite examples/clusters/crusoe VMs section to use native backend instead of SSH fleets, with tested NCCL config Made-with: Cursor * Refactor documentation: Remove partners.md, update mkdocs.yml, and add Crusoe logo * Linter fix --- docs/assets/images/crusoe-logo.svg | 3 + docs/docs/concepts/backends.md | 230 ++++++++-------- docs/docs/concepts/fleets.md | 11 +- docs/overrides/home.html | 24 +- docs/partners.md | 139 ---------- examples/clusters/crusoe/README.md | 246 ++++++++---------- mkdocs.yml | 3 +- .../_internal/core/backends/crusoe/models.py | 2 +- .../_internal/core/models/backends/base.py | 2 +- 9 files changed, 253 insertions(+), 407 deletions(-) create mode 100644 docs/assets/images/crusoe-logo.svg delete mode 100644 docs/partners.md diff --git a/docs/assets/images/crusoe-logo.svg b/docs/assets/images/crusoe-logo.svg new file mode 
100644 index 0000000000..f973485ed7 --- /dev/null +++ b/docs/assets/images/crusoe-logo.svg @@ -0,0 +1,3 @@ + + + diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 5446cef160..bf731823fa 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -745,9 +745,10 @@ projects: Nebius is only supported if `dstack server` is running on Python 3.10 or higher. -### Vultr +### Crusoe -Log into your [Vultr](https://www.vultr.com/) account, click `Account` in the sidebar, select `API`, find the `Personal Access Token` panel and click the `Enable API` button. In the `Access Control` panel, allow API requests from all addresses or from the subnet where your `dstack` server is deployed. +Log into your [Crusoe](https://console.crusoecloud.com/) console and create an API key +under your account settings. Note your project ID from the project settings page. Then, go ahead and configure the backend: @@ -755,89 +756,22 @@ Then, go ahead and configure the backend: ```yaml projects: - - name: main - backends: - - type: vultr - creds: - type: api_key - api_key: B57487240a466624b48de22865589 +- name: main + backends: + - type: crusoe + project_id: your-project-id + creds: + type: access_key + access_key: your-access-key + secret_key: your-secret-key + regions: + - us-east1-a + - us-southcentral1-a ```
    -### OCI - -There are two ways to configure OCI: using client credentials or using the default credentials. - -=== "Default credentials" - If you have default credentials set up in `~/.oci/config`, configure the backend like this: - -
    - - ```yaml - projects: - - name: main - backends: - - type: oci - creds: - type: default - ``` - -
    - -=== "Client credentials" - - Log into the [OCI Console](https://cloud.oracle.com), go to `My profile`, - select `API keys`, and click `Add API key`. - - Once you add a key, you'll see the configuration file. Copy its values to configure the backend as follows: - -
    - - ```yaml - projects: - - name: main - backends: - - type: oci - creds: - type: client - user: ocid1.user.oc1..g5vlaeqfu47akmaafq665xsgmyaqjktyfxtacfxc4ftjxuca7aohnd2ev66m - tenancy: ocid1.tenancy.oc1..ajqsftvk4qarcfaak3ha4ycdsaahxmaita5frdwg3tqo2bcokpd3n7oizwai - region: eu-frankfurt-1 - fingerprint: 77:32:77:00:49:7c:cb:56:84:75:8e:77:96:7d:53:17 - key_file: ~/.oci/private_key.pem - ``` - -
    - - Make sure to include either the path to your private key via `key_file` or the contents of the key via `key_content`. - -??? info "Required permissions" - - This is an example of a restrictive policy for a group of `dstack` users: - - ``` - Allow group to read compartments in tenancy where target.compartment.name = '' - Allow group to read marketplace-community-listings in compartment - Allow group to manage app-catalog-listing in compartment - Allow group to manage instances in compartment - Allow group to manage compute-capacity-reports in compartment - Allow group to manage volumes in compartment - Allow group to manage volume-attachments in compartment - Allow group to manage virtual-network-family in compartment - ``` - - To use this policy, create a compartment for `dstack` and specify it in `~/.dstack/server/config.yml`. - - ```yaml - projects: - - name: main - backends: - - type: oci - creds: - type: default - compartment_id: ocid1.compartment.oc1..aaaaaaaa - ``` +`regions` is optional. If not specified, all available Crusoe regions are used. @@ -929,34 +863,6 @@ projects: * `sizes` - read * `ssh_key` - create, read, update,delete -### Crusoe Cloud - -Log into your [Crusoe Cloud](https://console.crusoecloud.com/) console and create an API key -under your account settings. Note your project ID from the project settings page. - -Then, go ahead and configure the backend: - -
    - -```yaml -projects: -- name: main - backends: - - type: crusoe - project_id: your-project-id - creds: - type: access_key - access_key: your-access-key - secret_key: your-secret-key - regions: - - us-east1-a - - us-southcentral1-a -``` - -
    - -`regions` is optional. If not specified, all available Crusoe regions are used. - ### Hot Aisle Log in to the SSH TUI as described in the [Hot Aisle Quick Start](https://hotaisle.xyz/quick-start/). @@ -1008,6 +914,106 @@ projects: +### Vultr + +Log into your [Vultr](https://www.vultr.com/) account, click `Account` in the sidebar, select `API`, find the `Personal Access Token` panel and click the `Enable API` button. In the `Access Control` panel, allow API requests from all addresses or from the subnet where your `dstack` server is deployed. + +Then, go ahead and configure the backend: + +
    + +```yaml +projects: + - name: main + backends: + - type: vultr + creds: + type: api_key + api_key: B57487240a466624b48de22865589 +``` + +
    + +### OCI + +There are two ways to configure OCI: using client credentials or using the default credentials. + +=== "Default credentials" + If you have default credentials set up in `~/.oci/config`, configure the backend like this: + +
    + + ```yaml + projects: + - name: main + backends: + - type: oci + creds: + type: default + ``` + +
    + +=== "Client credentials" + + Log into the [OCI Console](https://cloud.oracle.com), go to `My profile`, + select `API keys`, and click `Add API key`. + + Once you add a key, you'll see the configuration file. Copy its values to configure the backend as follows: + +
    + + ```yaml + projects: + - name: main + backends: + - type: oci + creds: + type: client + user: ocid1.user.oc1..g5vlaeqfu47akmaafq665xsgmyaqjktyfxtacfxc4ftjxuca7aohnd2ev66m + tenancy: ocid1.tenancy.oc1..ajqsftvk4qarcfaak3ha4ycdsaahxmaita5frdwg3tqo2bcokpd3n7oizwai + region: eu-frankfurt-1 + fingerprint: 77:32:77:00:49:7c:cb:56:84:75:8e:77:96:7d:53:17 + key_file: ~/.oci/private_key.pem + ``` + +
    + + Make sure to include either the path to your private key via `key_file` or the contents of the key via `key_content`. + +??? info "Required permissions" + + This is an example of a restrictive policy for a group of `dstack` users: + + ``` + Allow group to read compartments in tenancy where target.compartment.name = '' + Allow group to read marketplace-community-listings in compartment + Allow group to manage app-catalog-listing in compartment + Allow group to manage instances in compartment + Allow group to manage compute-capacity-reports in compartment + Allow group to manage volumes in compartment + Allow group to manage volume-attachments in compartment + Allow group to manage virtual-network-family in compartment + ``` + + To use this policy, create a compartment for `dstack` and specify it in `~/.dstack/server/config.yml`. + + ```yaml + projects: + - name: main + backends: + - type: oci + creds: + type: default + compartment_id: ocid1.compartment.oc1..aaaaaaaa + ``` + +SSH fleets support the same features as [VM-based](#vm-based) backends. + +!!! info "What's next" + 1. See the [`~/.dstack/server/config.yml`](../reference/server/config.yml.md) reference + 2. Check [Projects](../concepts/projects.md) + ## Container-based Container-based backends allow `dstack` to orchestrate container-based runs either directly on cloud providers that support containers or on Kubernetes. @@ -1206,9 +1212,3 @@ projects: Also, the `vastai` backend supports on-demand instances only. Spot instance support coming soon. - -SSH fleets support the same features as [VM-based](#vm-based) backends. - -!!! info "What's next" - 1. See the [`~/.dstack/server/config.yml`](../reference/server/config.yml.md) reference - 2. 
Check [Projects](../concepts/projects.md) diff --git a/docs/docs/concepts/fleets.md b/docs/docs/concepts/fleets.md index 02308a649a..027ea14ed0 100644 --- a/docs/docs/concepts/fleets.md +++ b/docs/docs/concepts/fleets.md @@ -160,7 +160,7 @@ This property ensures that instances are interconnected. This is required for ru #### Backends - Fast interconnect is supported on the `aws`, `gcp`, `nebius`, `kubernetes`, and `runpod` backends. Some backends may require additional configuration. + Fast interconnect is supported on the `aws`, `gcp`, `nebius`, `crusoe`, and `kubernetes` backends. Some backends may require additional configuration. === "AWS" On AWS, `dstack` requires `public_ips` to be set to `false` in the backend configuration. @@ -173,15 +173,18 @@ This property ensures that instances are interconnected. This is required for ru === "Nebius" On [Nebius](https://docs.nebius.com/compute/clusters/gpu), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. + === "Crusoe" + On [Crusoe](https://docs.crusoecloud.com/networking/infiniband/managing-infiniband-networks), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. + Refer to the [Crusoe](../../examples/clusters/crusoe/index.md#vms) example for more details. + === "Kubernetes" If the Kubernetes cluster has interconnect configured, `dstack` can use it without additional setup. See the [Lambda](../../examples/clusters/lambda/index.md#kubernetes) or [Crusoe](../../examples/clusters/crusoe/index.md#kubernetes) examples. - === "Runpod" - On [Runpod](https://docs.runpod.io/instant-clusters), `dstack` automatically configures InfiniBand networking if it is supported by the selected instance type. - > See the [Clusters](../../examples.md#clusters) examples. + + === "SSH fleets" If the hosts in the SSH fleet have interconnect configured, you only need to set `placement` to `cluster`. 
diff --git a/docs/overrides/home.html b/docs/overrides/home.html index 05f5468198..a9aec887ad 100644 --- a/docs/overrides/home.html +++ b/docs/overrides/home.html @@ -135,32 +135,32 @@

    A unified control plane for GPU orchestration

    - - + + - - + + - - + + - - + + - - + + - - + + diff --git a/docs/partners.md b/docs/partners.md deleted file mode 100644 index 44295691a8..0000000000 --- a/docs/partners.md +++ /dev/null @@ -1,139 +0,0 @@ ---- -#template: backends.html -hide: - - navigation -# - toc - - footer ---- - - - -> Here are the cloud GPU providers that `dstack` integrates with and collaborates with. - -## NVIDIA - - - -## AMD - - - -[//]: # (## Google TPU) - - - -## Join the ecosystem - -Are you interested in integrating and collaborating with us to become part of the ecosystem? - - - Contribute -   - Talk to an expert - diff --git a/examples/clusters/crusoe/README.md b/examples/clusters/crusoe/README.md index 50ec88e461..ed416ae3e7 100644 --- a/examples/clusters/crusoe/README.md +++ b/examples/clusters/crusoe/README.md @@ -1,15 +1,76 @@ --- title: Crusoe -description: Using Crusoe clusters with InfiniBand support via Kubernetes or VMs +description: Using Crusoe clusters with InfiniBand support via VMs or Kubernetes --- # Crusoe `dstack` allows using Crusoe clusters with fast interconnect via two ways: +* [VMs](#vms) – If you configure a `crusoe` backend in `dstack` by providing your Crusoe credentials, `dstack` lets you fully provision and use clusters through `dstack`. * [Kubernetes](#kubernetes) – If you create a Kubernetes cluster on Crusoe and configure a `kubernetes` backend and create a backend fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. -* [VMs](#vms) – If you create a VM cluster on Crusoe and create an SSH fleet in `dstack`, `dstack` lets you fully use this cluster through `dstack`. - + +## VMs + +Since `dstack` offers a VM-based backend that natively integrates with Crusoe, you only need to provide your Crusoe credentials to `dstack`, and it will allow you to fully provision and use clusters on Crusoe through `dstack`. 
+ +### Configure a backend + +Log into your [Crusoe](https://console.crusoecloud.com/) console, create an API key under your account settings, and note your project ID. + +
    + +```yaml +projects: +- name: main + backends: + - type: crusoe + project_id: your-project-id + creds: + type: access_key + access_key: your-access-key + secret_key: your-secret-key +``` + +
    + +### Create a fleet + +Once the backend is configured, you can create a fleet: + +
    + +```yaml +type: fleet +name: crusoe-fleet + +nodes: 2 +placement: cluster + +backends: [crusoe] + +resources: + gpu: A100:80GB:8 +``` + +
    + +Pass the fleet configuration to `dstack apply`: + +
    + +```shell +$ dstack apply -f crusoe-fleet.dstack.yml +``` + +
    + +This will automatically create an IB partition and provision instances with InfiniBand networking. + +Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). + +> If you want instances to be provisioned on demand, you can set `nodes` to `0..2`. In this case, `dstack` will create instances only when you run workloads. + ## Kubernetes ### Create a cluster @@ -74,56 +135,68 @@ $ dstack apply -f crusoe-fleet.dstack.yml Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). -## VMs - -Another way to work with Crusoe clusters is through VMs. While `dstack` typically supports VM-based compute providers via [dedicated backends](https://dstack.ai/docs/concepts/backends#vm-based) that automate provisioning, Crusoe does not yet have [such a backend](https://github.com/dstackai/dstack/issues/3378). As a result, to use a VM-based Crusoe cluster with `dstack`, you should use [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh-fleets). - -### Create instances - -1. Go to `Compute`, then `Instances`, and click `Create Instance`. Make sure to select the right instance type and VM image (that [support interconnect](https://docs.crusoecloud.com/networking/infiniband/managing-infiniband-networks/index.html)). Make sure to create as many instances as needed. +## NCCL tests -### Create a `dstack` fleet +Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) that runs NCCL tests to validate cluster network bandwidth. -Follow the standard instructions for setting up an [SSH fleet](https://dstack.ai/docs/concepts/fleets/#ssh-fleets): +=== "VMs" -
    - -```yaml -type: fleet -name: crusoe-fleet + With the Crusoe backend, HPC-X and NCCL topology files are pre-installed on the host VM image. Mount them into the container via [instance volumes](https://dstack.ai/docs/concepts/volumes#instance-volumes). -placement: cluster +
    -# SSH credentials for the on-prem servers -ssh_config: - user: ubuntu - identity_file: ~/.ssh/id_rsa - hosts: - - 3.255.177.51 - - 3.255.177.52 -``` - -
    + ```yaml + type: task + name: nccl-tests -Pass the fleet configuration to `dstack apply`: + nodes: 2 + startup_order: workers-first + stop_criteria: master-done -
    + volumes: + - /opt/hpcx:/opt/hpcx + - /etc/crusoe/nccl_topo:/etc/crusoe/nccl_topo -```shell -$ dstack apply -f crusoe-fleet.dstack.yml -``` + commands: + - . /opt/hpcx/hpcx-init.sh + - hpcx_load + - | + if [ $DSTACK_NODE_RANK -eq 0 ]; then + mpirun \ + --allow-run-as-root \ + --hostfile $DSTACK_MPI_HOSTFILE \ + -n $DSTACK_GPUS_NUM \ + -N $DSTACK_GPUS_PER_NODE \ + --bind-to none \ + -mca btl tcp,self \ + -mca coll_hcoll_enable 0 \ + -x PATH \ + -x LD_LIBRARY_PATH \ + -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ + -x NCCL_SOCKET_NTHREADS=4 \ + -x NCCL_NSOCKS_PERTHREAD=8 \ + -x NCCL_TOPO_FILE=/etc/crusoe/nccl_topo/a100-80gb-sxm-ib-cloud-hypervisor.xml \ + -x NCCL_IB_MERGE_VFS=0 \ + -x NCCL_IB_HCA=^mlx5_0:1 \ + /opt/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -t 1 -g 1 -c 1 -n 100 + else + sleep infinity + fi -
    + backends: [crusoe] -Once the fleet is created, you can run [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), and [services](https://dstack.ai/docs/concepts/services). + resources: + gpu: A100:80GB:8 + shm_size: 16GB + ``` -## NCCL tests +
    -Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks) that runs NCCL tests to validate cluster network bandwidth. + > Update `NCCL_TOPO_FILE` to match your instance type. Topology files for all supported types are available at `/etc/crusoe/nccl_topo/` on the host. -=== "Crusoe Managed Kubernetes" +=== "Kubernetes" - If you’re running on Crusoe Managed Kubernetes, make sure to install HPC-X and provide an up-to-date topology file. + If you're running on Crusoe Managed Kubernetes, make sure to install HPC-X and provide an up-to-date topology file.
    @@ -190,105 +263,12 @@ Use a [distributed task](https://dstack.ai/docs/concepts/tasks#distributed-tasks ??? info "Privileged" When running on Crusoe Managed Kubernetes, set `privileged` to `true` to ensure access to InfiniBand. -=== "VMs" - -With Crusoe VMs, HPC-X and up-to-date topology files are already available on the hosts. When using SSH fleets, simply mount them via [instance volumes](https://dstack.ai/docs/concepts/volumes#instance-volumes). - -```yaml -type: task -name: nccl-tests - -nodes: 2 -startup_order: workers-first -stop_criteria: master-done - -volumes: - - /opt/hpcx:/opt/hpcx - - /etc/crusoe/nccl_topo:/etc/crusoe/nccl_topo - -commands: - - . /opt/hpcx/hpcx-init.sh - - hpcx_load - # Run NCCL Tests - - | - if [ $DSTACK_NODE_RANK -eq 0 ]; then - mpirun \ - --allow-run-as-root \ - --hostfile $DSTACK_MPI_HOSTFILE \ - -n $DSTACK_GPUS_NUM \ - -N $DSTACK_GPUS_PER_NODE \ - --bind-to none \ - -mca btl tcp,self \ - -mca coll_hcoll_enable 0 \ - -x PATH \ - -x LD_LIBRARY_PATH \ - -x CUDA_DEVICE_ORDER=PCI_BUS_ID \ - -x NCCL_SOCKET_NTHREADS=4 \ - -x NCCL_NSOCKS_PERTHREAD=8 \ - -x NCCL_TOPO_FILE=/etc/crusoe/nccl_topo/a100-80gb-sxm-ib-cloud-hypervisor.xml \ - -x NCCL_IB_MERGE_VFS=0 \ - -x NCCL_IB_AR_THRESHOLD=0 \ - -x NCCL_IB_PCI_RELAXED_ORDERING=1 \ - -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ - -x NCCL_IB_QPS_PER_CONNECTION=2 \ - -x NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1 \ - -x UCX_NET_DEVICES=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1 \ - /opt/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -t 1 -g 1 -c 1 -n 100 - else - sleep infinity - fi - -resources: - gpu: A100:8 - shm_size: 16GB -``` - Pass the configuration to `dstack apply`:
    ```shell $ dstack apply -f crusoe-nccl-tests.dstack.yml - -Provisioning... ----> 100% - -nccl-tests provisioning completed (running) - -out-of-place in-place - size count type redop root time algbw busbw #wrong time algbw busbw #wrong - (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) - 8 2 float sum -1 27.70 0.00 0.00 0 29.82 0.00 0.00 0 - 16 4 float sum -1 28.78 0.00 0.00 0 28.99 0.00 0.00 0 - 32 8 float sum -1 28.49 0.00 0.00 0 28.16 0.00 0.00 0 - 64 16 float sum -1 28.41 0.00 0.00 0 28.69 0.00 0.00 0 - 128 32 float sum -1 28.94 0.00 0.01 0 28.58 0.00 0.01 0 - 256 64 float sum -1 29.46 0.01 0.02 0 29.45 0.01 0.02 0 - 512 128 float sum -1 30.23 0.02 0.03 0 29.85 0.02 0.03 0 - 1024 256 float sum -1 30.79 0.03 0.06 0 34.03 0.03 0.06 0 - 2048 512 float sum -1 37.90 0.05 0.10 0 33.22 0.06 0.12 0 - 4096 1024 float sum -1 35.91 0.11 0.21 0 35.30 0.12 0.22 0 - 8192 2048 float sum -1 36.84 0.22 0.42 0 38.30 0.21 0.40 0 - 16384 4096 float sum -1 47.08 0.35 0.65 0 37.26 0.44 0.82 0 - 32768 8192 float sum -1 45.20 0.72 1.36 0 48.70 0.67 1.26 0 - 65536 16384 float sum -1 49.43 1.33 2.49 0 50.97 1.29 2.41 0 - 131072 32768 float sum -1 51.08 2.57 4.81 0 50.17 2.61 4.90 0 - 262144 65536 float sum -1 192.78 1.36 2.55 0 100.00 2.62 4.92 0 - 524288 131072 float sum -1 68.02 7.71 14.45 0 69.40 7.55 14.16 0 - 1048576 262144 float sum -1 81.71 12.83 24.06 0 88.58 11.84 22.20 0 - 2097152 524288 float sum -1 113.03 18.55 34.79 0 102.21 20.52 38.47 0 - 4194304 1048576 float sum -1 123.50 33.96 63.68 0 131.71 31.84 59.71 0 - 8388608 2097152 float sum -1 189.42 44.29 83.04 0 183.01 45.84 85.95 0 - 16777216 4194304 float sum -1 274.05 61.22 114.79 0 265.91 63.09 118.30 0 - 33554432 8388608 float sum -1 490.77 68.37 128.20 0 490.53 68.40 128.26 0 - 67108864 16777216 float sum -1 854.62 78.52 147.23 0 853.49 78.63 147.43 0 - 134217728 33554432 float sum -1 1483.43 90.48 169.65 0 1479.22 90.74 170.13 0 - 268435456 67108864 float sum -1 2700.36 99.41 186.39 0 2700.49 99.40 186.38 0 - 
536870912 134217728 float sum -1 5300.49 101.29 189.91 0 5314.91 101.01 189.40 0 - 1073741824 268435456 float sum -1 10472.2 102.53 192.25 0 10485.6 102.40 192.00 0 - 2147483648 536870912 float sum -1 20749.1 103.50 194.06 0 20745.7 103.51 194.09 0 - Out of bounds values : 0 OK - Avg bus bandwidth : 53.7387 ```
    @@ -296,5 +276,5 @@ out-of-place in-place ## What's next 1. Learn about [dev environments](https://dstack.ai/docs/concepts/dev-environments), [tasks](https://dstack.ai/docs/concepts/tasks), [services](https://dstack.ai/docs/concepts/services) -2. Read the [Kuberentes](https://dstack.ai/docs/guides/kubernetes), and [Clusters](https://dstack.ai/docs/guides/clusters) guides +2. Check out [backends](https://dstack.ai/docs/concepts/backends#crusoe) and [fleets](https://dstack.ai/docs/concepts/fleets#cloud-fleets) 3. Check the docs on [Crusoe's networking](https://docs.crusoecloud.com/networking/infiniband/) and ["Crusoe Managed" Kubernetes](https://docs.crusoecloud.com/orchestration/cmk/index.html) diff --git a/mkdocs.yml b/mkdocs.yml index 58c176f3db..de82cba605 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -106,8 +106,7 @@ plugins: "docs/examples/deployment/nim/index.md": "examples/inference/nim/index.md" "docs/examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" "docs/examples/deployment/tgi/index.md": "examples/inference/tgi/index.md" - "providers.md": "partners.md" - "backends.md": "partners.md" + "backends.md": "docs/concepts/backends.md" "blog/monitoring-gpu-usage.md": "blog/posts/dstack-metrics.md" "blog/inactive-dev-environments-auto-shutdown.md": "blog/posts/inactivity-duration.md" "blog/data-centers-and-private-clouds.md": "blog/posts/gpu-blocks-and-proxy-jump.md" diff --git a/src/dstack/_internal/core/backends/crusoe/models.py b/src/dstack/_internal/core/backends/crusoe/models.py index f405eca1b7..d867301c0d 100644 --- a/src/dstack/_internal/core/backends/crusoe/models.py +++ b/src/dstack/_internal/core/backends/crusoe/models.py @@ -22,7 +22,7 @@ class CrusoeBackendConfig(CoreModel): Literal["crusoe"], Field(description="The type of backend"), ] = "crusoe" - project_id: Annotated[str, Field(description="The Crusoe Cloud project ID")] + project_id: Annotated[str, Field(description="The Crusoe project ID")] regions: Annotated[
Optional[List[str]], Field(description="The list of allowed Crusoe regions. Omit to use all regions"), diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index 47552010d4..ba382a0b66 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -8,7 +8,7 @@ class BackendType(str, enum.Enum): AWS (BackendType): Amazon Web Services AZURE (BackendType): Microsoft Azure CLOUDRIFT (BackendType): CloudRift - CRUSOE (BackendType): Crusoe Cloud + CRUSOE (BackendType): Crusoe CUDO (BackendType): Cudo DATACRUNCH (BackendType): DataCrunch (for backward compatibility) DIGITALOCEAN (BackendType): DigitalOcean From 84e2c70444215e2b8a432e2a74d6ca965290c463 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Sat, 28 Feb 2026 00:01:51 +0100 Subject: [PATCH 174/187] Report runtime `working_dir` and `username` from `runner` via `JobRuntimeData` (#3617) * Report working_dir and username from runner to server via JobRuntimeData The runner now reports the resolved working directory and OS username back to the server through the PullResponse. The server persists these in JobRuntimeData (write-once), and the frontend uses job_runtime_data.working_dir to construct correct IDE deep-link URLs instead of the hardcoded legacy /workflow path. Made-with: Cursor * Return new fields in /api/run response /api/pull is too late, we need these fields as soon as the job state is switched to RUNNING * [UI] In the run details page, for the dev environment, expect `latestSubmission?.job_runtime_data?.working_dir` after `run.status === 'running'`; otherwise, fallback to `/`. 
--------- Co-authored-by: Dmitry Meyer --- .../index.tsx | 4 +- frontend/src/types/run.d.ts | 6 + runner/internal/executor/base.go | 1 + runner/internal/executor/executor.go | 116 +++++++++++++----- runner/internal/runner/api/http.go | 13 +- runner/internal/schemas/schemas.go | 6 +- src/dstack/_internal/core/models/runs.py | 4 + .../scheduled_tasks/running_jobs.py | 8 +- src/dstack/_internal/server/schemas/runner.py | 5 + .../server/services/runner/client.py | 14 ++- src/dstack/_internal/server/testing/common.py | 4 + src/dstack/api/_public/runs.py | 6 +- .../scheduled_tasks/test_running_jobs.py | 16 ++- 13 files changed, 162 insertions(+), 41 deletions(-) diff --git a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx index af63e9c67b..54d03c388d 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx @@ -54,7 +54,9 @@ export const ConnectToRunWithDevEnvConfiguration: FC<{ run: IRun }> = ({ run }) const [sshCommand, copySSHCommand] = getSSHCommand(run); const configuration = run.run_spec.configuration as TDevEnvironmentConfiguration; - const openInIDEUrl = `${configuration.ide}://vscode-remote/ssh-remote+${run.run_spec.run_name}/${run.run_spec.working_dir || 'workflow'}`; + const latestSubmission = run.jobs[0]?.job_submissions?.slice(-1)[0]; + const workingDir = latestSubmission?.job_runtime_data?.working_dir ?? 
'/'; + const openInIDEUrl = `${configuration.ide}://vscode-remote/ssh-remote+${run.run_spec.run_name}${workingDir}`; const ideDisplayName = getIDEDisplayName(configuration.ide); const [configCliCommand, copyCliCommand] = useConfigProjectCliCommand({ projectName: run.project_name }); diff --git a/frontend/src/types/run.d.ts b/frontend/src/types/run.d.ts index 3eac746218..928a022804 100644 --- a/frontend/src/types/run.d.ts +++ b/frontend/src/types/run.d.ts @@ -293,9 +293,15 @@ declare interface IJobProvisioningData { backend_data?: string; } +declare interface IJobRuntimeData { + working_dir?: string | null; + username?: string | null; +} + declare interface IJobSubmission { id: string; job_provisioning_data?: IJobProvisioningData | null; + job_runtime_data?: IJobRuntimeData | null; error_code?: TJobErrorCode | null; submission_num: number; status: TJobStatus; diff --git a/runner/internal/executor/base.go b/runner/internal/executor/base.go index 99e32250cb..fac1266fb0 100644 --- a/runner/internal/executor/base.go +++ b/runner/internal/executor/base.go @@ -12,6 +12,7 @@ type Executor interface { GetHistory(timestamp int64) *schemas.PullResponse GetJobWsLogsHistory() []schemas.LogEvent GetRunnerState() string + GetJobInfo(ctx context.Context) (username string, workingDir string, err error) Run(ctx context.Context) error SetJob(job schemas.SubmitBody) SetJobState(ctx context.Context, state types.JobState) diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go index 61e18ee3e9..311eddaa10 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/executor/executor.go @@ -21,6 +21,7 @@ import ( "github.com/creack/pty" "github.com/dstackai/ansistrip" "github.com/prometheus/procfs" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" "github.com/dstackai/dstack/runner/consts" @@ -61,6 +62,10 @@ type RunExecutor struct { fileArchiveDir string repoBlobDir string + runnerLogFile *os.File + runnerLogStripper *ansistrip.Writer + 
runnerLogger *logrus.Entry + run schemas.Run jobSpec schemas.JobSpec jobSubmission schemas.JobSubmission @@ -136,14 +141,26 @@ func NewRunExecutor(tempDir string, dstackDir string, currentUser linuxuser.User }, nil } +// GetJobInfo must be called after SetJob +func (ex *RunExecutor) GetJobInfo(ctx context.Context) (string, string, error) { + // preRun() sets ex.jobUser and ex.jobWorkingDir + if err := ex.preRun(ctx); err != nil { + return "", "", err + } + return ex.jobUser.Username, ex.jobWorkingDir, nil +} + // Run must be called after SetJob and WriteRepoBlob func (ex *RunExecutor) Run(ctx context.Context) (err error) { - runnerLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerLogFileName)) - if err != nil { - ex.SetJobState(ctx, types.JobStateFailed) - return fmt.Errorf("create runner log file: %w", err) + // If jobStateHistory is not empty, either Run() has already been called or + // preRun() has already been called via GetJobInfo() and failed + if len(ex.jobStateHistory) > 0 { + return errors.New("already running or finished") + } + if err := ex.preRun(ctx); err != nil { + return err } - defer func() { _ = runnerLogFile.Close() }() + defer ex.postRun(ctx) jobLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerJobLogFileName)) if err != nil { @@ -153,7 +170,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { defer func() { _ = jobLogFile.Close() }() defer func() { - // recover goes after runnerLogFile.Close() to keep the log + // recover goes after postRun(), which closes runnerLogFile, to keep the log if r := recover(); r != nil { log.Error(ctx, "Executor PANIC", "err", r) ex.SetJobState(ctx, types.JobStateFailed) @@ -171,21 +188,8 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { } }() - stripper := ansistrip.NewWriter(ex.runnerLogs, AnsiStripFlushInterval, AnsiStripMaxDelay, MaxBufferSize) - defer func() { _ = stripper.Close() }() - logger := io.MultiWriter(runnerLogFile, 
os.Stdout, stripper) - ctx = log.WithLogger(ctx, log.NewEntry(logger, int(log.DefaultEntry.Logger.Level))) // todo loglevel - log.Info(ctx, "Run job", "log_level", log.GetLogger(ctx).Logger.Level.String()) - - if err := ex.setJobUser(ctx); err != nil { - ex.SetJobStateWithTerminationReason( - ctx, - types.JobStateFailed, - types.TerminationReasonExecutorError, - fmt.Sprintf("Failed to set job user (%s)", err), - ) - return fmt.Errorf("set job user: %w", err) - } + ctx = log.WithLogger(ctx, ex.runnerLogger) + log.Info(ctx, "Run job") // setJobUser sets User.HomeDir to "/" if the original home dir is not set or not accessible, // in that case we skip home dir provisioning @@ -204,16 +208,6 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { } } - if err := ex.setJobWorkingDir(ctx); err != nil { - ex.SetJobStateWithTerminationReason( - ctx, - types.JobStateFailed, - types.TerminationReasonExecutorError, - fmt.Sprintf("Failed to set job working dir (%s)", err), - ) - return fmt.Errorf("set job working dir: %w", err) - } - if err := ex.setupRepo(ctx); err != nil { ex.SetJobStateWithTerminationReason( ctx, @@ -336,6 +330,66 @@ func (ex *RunExecutor) SetRunnerState(state string) { ex.state = state } +// preRun performs actions that were once part of Run() but were moved to a separate function +// to implement GetJobInfo() +// preRun must not execute long-running operations, as GetJobInfo() is called synchronously +// in the /api/run method +func (ex *RunExecutor) preRun(ctx context.Context) error { + // Already called once + if ex.runnerLogFile != nil { + return nil + } + + // logging is required for the subsequent setJob{User,WorkingDir} calls + runnerLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerLogFileName)) + if err != nil { + ex.SetJobState(ctx, types.JobStateFailed) + return fmt.Errorf("create runner log file: %w", err) + } + ex.runnerLogFile = runnerLogFile + ex.runnerLogStripper = ansistrip.NewWriter(ex.runnerLogs, 
AnsiStripFlushInterval, AnsiStripMaxDelay, MaxBufferSize) + runnerLogWriter := io.MultiWriter(ex.runnerLogFile, os.Stdout, ex.runnerLogStripper) + runnerLogLevel := log.DefaultEntry.Logger.Level + ex.runnerLogger = log.NewEntry(runnerLogWriter, int(runnerLogLevel)) + ctx = log.WithLogger(ctx, ex.runnerLogger) + log.Info(ctx, "Logging configured", "log_level", runnerLogLevel.String()) + + // jobUser and jobWorkingDir are required for GetJobInfo() + if err := ex.setJobUser(ctx); err != nil { + ex.SetJobStateWithTerminationReason( + ctx, + types.JobStateFailed, + types.TerminationReasonExecutorError, + fmt.Sprintf("Failed to set job user (%s)", err), + ) + return fmt.Errorf("set job user: %w", err) + } + if err := ex.setJobWorkingDir(ctx); err != nil { + ex.SetJobStateWithTerminationReason( + ctx, + types.JobStateFailed, + types.TerminationReasonExecutorError, + fmt.Sprintf("Failed to set job working dir (%s)", err), + ) + return fmt.Errorf("set job working dir: %w", err) + } + + return nil +} + +func (ex *RunExecutor) postRun(ctx context.Context) { + if ex.runnerLogFile != nil { + if err := ex.runnerLogFile.Close(); err != nil { + log.Error(ctx, "Failed to close runnerLogFile", "err", err) + } + } + if ex.runnerLogStripper != nil { + if err := ex.runnerLogStripper.Close(); err != nil { + log.Error(ctx, "Failed to close runnerLogStripper", "err", err) + } + } +} + // setJobWorkingDir must be called from Run after setJobUser func (ex *RunExecutor) setJobWorkingDir(ctx context.Context) error { var err error diff --git a/runner/internal/runner/api/http.go b/runner/internal/runner/api/http.go index 87eb96e0af..4d1c7daf54 100644 --- a/runner/internal/runner/api/http.go +++ b/runner/internal/runner/api/http.go @@ -146,18 +146,27 @@ func (s *Server) uploadCodePostHandler(w http.ResponseWriter, r *http.Request) ( func (s *Server) runPostHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) { s.executor.Lock() - defer s.executor.Unlock() if 
s.executor.GetRunnerState() != executor.WaitRun { + s.executor.Unlock() return nil, &api.Error{Status: http.StatusConflict} } + s.executor.SetRunnerState(executor.ServeLogs) + s.executor.Unlock() var runCtx context.Context runCtx, s.cancelRun = context.WithCancel(context.Background()) + username, workingDir, err := s.executor.GetJobInfo(runCtx) go func() { _ = s.executor.Run(runCtx) // INFO: all errors are handled inside the Run() s.jobBarrierCh <- nil // notify server that job finished }() - s.executor.SetRunnerState(executor.ServeLogs) + + if err == nil { + return &schemas.JobInfoResponse{ + Username: username, + WorkingDir: workingDir, + }, nil + } return nil, nil } diff --git a/runner/internal/schemas/schemas.go b/runner/internal/schemas/schemas.go index 152637decc..10ab62ea95 100644 --- a/runner/internal/schemas/schemas.go +++ b/runner/internal/schemas/schemas.go @@ -35,7 +35,11 @@ type PullResponse struct { LastUpdated int64 `json:"last_updated"` NoConnectionsSecs int64 `json:"no_connections_secs"` HasMore bool `json:"has_more"` - // todo Result +} + +type JobInfoResponse struct { + WorkingDir string `json:"working_dir"` + Username string `json:"username"` } type Run struct { diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index 88c2f38f5e..558b07e26e 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -352,6 +352,10 @@ class JobRuntimeData(CoreModel): volume_names: Optional[list[str]] = None # None for backward compatibility # Virtual shared offer offer: Optional[InstanceOfferWithAvailability] = None # None for backward compatibility + # Resolved working directory and OS username reported by the runner. + # None if the runner hasn't reported them yet or if it's an old runner. 
+ working_dir: Optional[str] = None + username: Optional[str] = None class ClusterInfo(CoreModel): diff --git a/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py b/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py index 9d3bd04c3b..5916c9054a 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/running_jobs.py @@ -1116,7 +1116,13 @@ def _submit_job_to_runner( logger.debug("%s: uploading code", fmt(job_model)) runner_client.upload_code(code) logger.debug("%s: starting job", fmt(job_model)) - runner_client.run_job() + job_info = runner_client.run_job() + if job_info is not None: + jrd = get_job_runtime_data(job_model) + if jrd is not None: + jrd.working_dir = job_info.working_dir + jrd.username = job_info.username + job_model.job_runtime_data = jrd.json() switch_job_status(session, job_model, JobStatus.RUNNING) # do not log here, because the runner will send a new status diff --git a/src/dstack/_internal/server/schemas/runner.py b/src/dstack/_internal/server/schemas/runner.py index 12ff6c6825..89649ddda6 100644 --- a/src/dstack/_internal/server/schemas/runner.py +++ b/src/dstack/_internal/server/schemas/runner.py @@ -46,6 +46,11 @@ class PullResponse(CoreModel): no_connections_secs: Optional[int] = None # Optional for compatibility with old runners +class JobInfoResponse(CoreModel): + working_dir: str + username: str + + class SubmitBody(CoreModel): run: Annotated[ Run, diff --git a/src/dstack/_internal/server/services/runner/client.py b/src/dstack/_internal/server/services/runner/client.py index c83a42b744..c31726e76a 100644 --- a/src/dstack/_internal/server/services/runner/client.py +++ b/src/dstack/_internal/server/services/runner/client.py @@ -24,6 +24,7 @@ GPUDevice, HealthcheckResponse, InstanceHealthResponse, + JobInfoResponse, LegacyPullResponse, LegacyStopBody, LegacySubmitBody, @@ -124,9 +125,13 @@ def upload_code(self, file: 
Union[BinaryIO, bytes]): ) resp.raise_for_status() - def run_job(self): + def run_job(self) -> Optional[JobInfoResponse]: resp = requests.post(self._url("/api/run"), timeout=REQUEST_TIMEOUT) resp.raise_for_status() + if not _is_json_response(resp): + # Old runner or runner failed to get job info + return None + return JobInfoResponse.__response__.parse_obj(resp.json()) def pull(self, timestamp: int) -> PullResponse: resp = requests.get( @@ -617,6 +622,13 @@ def _memory_to_bytes(memory: Optional[Memory]) -> int: return int(memory * 1024**3) +def _is_json_response(response: requests.Response) -> bool: + content_type = response.headers.get("content-type") + if not content_type: + return False + return content_type.split(";", maxsplit=1)[0].strip() == "application/json" + + _TaskID = Union[uuid.UUID, str] _Version = tuple[int, int, int] diff --git a/src/dstack/_internal/server/testing/common.py b/src/dstack/_internal/server/testing/common.py index cca5212576..6bff65dea3 100644 --- a/src/dstack/_internal/server/testing/common.py +++ b/src/dstack/_internal/server/testing/common.py @@ -456,6 +456,8 @@ def get_job_runtime_data( ports: Optional[dict[int, int]] = None, offer: Optional[InstanceOfferWithAvailability] = None, volume_names: Optional[list[str]] = None, + working_dir: Optional[str] = None, + username: Optional[str] = None, ) -> JobRuntimeData: return JobRuntimeData( network_mode=NetworkMode(network_mode), @@ -465,6 +467,8 @@ def get_job_runtime_data( ports=ports, offer=offer, volume_names=volume_names, + working_dir=working_dir, + username=username, ) diff --git a/src/dstack/api/_public/runs.py b/src/dstack/api/_public/runs.py index 72d31189f2..675a88d292 100644 --- a/src/dstack/api/_public/runs.py +++ b/src/dstack/api/_public/runs.py @@ -352,9 +352,9 @@ def attach( if runtime_data is not None and runtime_data.ports is not None: container_ssh_port = runtime_data.ports.get(container_ssh_port, container_ssh_port) - # TODO: get login name from runner in case it's not 
specified in the run configuration - # (i.e. the default image user is used, and it is not root) - if job.job_spec.user is not None and job.job_spec.user.username is not None: + if runtime_data is not None and runtime_data.username is not None: + container_user = runtime_data.username + elif job.job_spec.user is not None and job.job_spec.user.username is not None: container_user = job.job_spec.user.username else: container_user = "root" diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_running_jobs.py b/src/tests/_internal/server/background/scheduled_tasks/test_running_jobs.py index 0d748f4e91..aad8615bf3 100644 --- a/src/tests/_internal/server/background/scheduled_tasks/test_running_jobs.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_running_jobs.py @@ -44,6 +44,7 @@ from dstack._internal.server.models import JobModel from dstack._internal.server.schemas.runner import ( HealthcheckResponse, + JobInfoResponse, JobStateEvent, PortMapping, PullResponse, @@ -188,6 +189,7 @@ async def test_runs_provisioning_job(self, test_db, session: AsyncSession): run=run, status=JobStatus.PROVISIONING, job_provisioning_data=job_provisioning_data, + job_runtime_data=get_job_runtime_data(), instance=instance, instance_assigned=True, ) @@ -201,6 +203,9 @@ async def test_runs_provisioning_job(self, test_db, session: AsyncSession): runner_client_mock.healthcheck.return_value = HealthcheckResponse( service="dstack-runner", version="0.0.1.dev2" ) + runner_client_mock.run_job.return_value = JobInfoResponse( + working_dir="/dstack/run", username="dstack" + ) await process_running_jobs() SSHTunnelMock.assert_called_once() runner_client_mock.healthcheck.assert_called_once() @@ -210,6 +215,9 @@ async def test_runs_provisioning_job(self, test_db, session: AsyncSession): await session.refresh(job) assert job is not None assert job.status == JobStatus.RUNNING + jrd = JobRuntimeData.__response__.parse_raw(job.job_runtime_data) + assert jrd.working_dir == 
"/dstack/run" + assert jrd.username == "dstack" @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -416,6 +424,9 @@ async def test_pulling_shim( PortMapping(container=10022, host=32771), PortMapping(container=10999, host=32772), ] + runner_client_mock.run_job.return_value = JobInfoResponse( + working_dir="/dstack/run", username="dstack" + ) await process_running_jobs() @@ -428,10 +439,13 @@ async def test_pulling_shim( await session.refresh(job) assert job is not None assert job.status == JobStatus.RUNNING - assert JobRuntimeData.__response__.parse_raw(job.job_runtime_data).ports == { + jrd = JobRuntimeData.__response__.parse_raw(job.job_runtime_data) + assert jrd.ports == { 10022: 32771, 10999: 32772, } + assert jrd.working_dir == "/dstack/run" + assert jrd.username == "dstack" @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) From 165de644f6a4d0aa06ff658e64aa4991582431ed Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Mon, 2 Mar 2026 11:02:14 +0500 Subject: [PATCH 175/187] Implement fleet pipeline (#3623) * Load only fleets active runs in apply_plan * Replace fleets many-to-many joinedloads with selectinloads * Optimize selects * WIP: FleetPipeline * Fixes * Add TestFleetWorker * Fix consolidation_attempt reset * Use typed dicts for update maps * Unify processing result classes * Centralize last_processed_at setting * Refactor _build_instance_update_rows() * Make result naming consistent * Refactor _create_missing_fleet_instances() * Enable FleetPipeline * Respect fleet locks in the API endpoints * Add FleetModel pipeline migration * Adjust CONSOLIDATION_RETRY_DELAYS * Fix fleet autodelete comments * Add scheduled tasks deprecated note * Cleanup comment * Add ix_fleets_pipeline_fetch_q index --- .../background/pipeline_tasks/__init__.py | 2 + .../server/background/pipeline_tasks/base.py | 86 ++- .../pipeline_tasks/compute_groups.py | 89 +-- 
.../background/pipeline_tasks/fleets.py | 558 ++++++++++++++++++ .../background/pipeline_tasks/gateways.py | 66 ++- .../pipeline_tasks/placement_groups.py | 58 +- .../background/pipeline_tasks/volumes.py | 46 +- .../background/scheduled_tasks/__init__.py | 10 +- .../scheduled_tasks/compute_groups.py | 2 + .../background/scheduled_tasks/fleets.py | 27 +- .../background/scheduled_tasks/gateways.py | 2 + .../scheduled_tasks/placement_groups.py | 2 + .../scheduled_tasks/submitted_jobs.py | 13 +- .../background/scheduled_tasks/volumes.py | 2 + ...e61de27_add_fleetmodel_pipeline_columns.py | 47 ++ ...ec_add_ix_fleets_pipeline_fetch_q_index.py | 49 ++ src/dstack/_internal/server/models.py | 13 +- .../_internal/server/services/fleets.py | 107 +++- .../server/services/gateways/__init__.py | 2 +- .../_internal/server/services/volumes.py | 2 +- .../background/pipeline_tasks/test_fleets.py | 398 +++++++++++++ .../pipeline_tasks/test_gateways.py | 4 + .../pipeline_tasks/test_placement_groups.py | 39 ++ .../background/scheduled_tasks/test_fleets.py | 9 + .../_internal/server/routers/test_fleets.py | 63 ++ 25 files changed, 1555 insertions(+), 141 deletions(-) create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/fleets.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/02_27_1218_d21d3e61de27_add_fleetmodel_pipeline_columns.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/03_02_0530_46150101edec_add_ix_fleets_pipeline_fetch_q_index.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_fleets.py diff --git a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py index d9f67680ce..6b3762419f 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py @@ -2,6 +2,7 @@ from 
dstack._internal.server.background.pipeline_tasks.base import Pipeline
 from dstack._internal.server.background.pipeline_tasks.compute_groups import ComputeGroupPipeline
+from dstack._internal.server.background.pipeline_tasks.fleets import FleetPipeline
 from dstack._internal.server.background.pipeline_tasks.gateways import GatewayPipeline
 from dstack._internal.server.background.pipeline_tasks.placement_groups import (
     PlacementGroupPipeline,
@@ -16,6 +17,7 @@ class PipelineManager:
     def __init__(self) -> None:
         self._pipelines: list[Pipeline] = [
             ComputeGroupPipeline(),
+            FleetPipeline(),
             GatewayPipeline(),
             PlacementGroupPipeline(),
             VolumePipeline(),
diff --git a/src/dstack/_internal/server/background/pipeline_tasks/base.py b/src/dstack/_internal/server/background/pipeline_tasks/base.py
index 9d016934cb..aa5af9a4a3 100644
--- a/src/dstack/_internal/server/background/pipeline_tasks/base.py
+++ b/src/dstack/_internal/server/background/pipeline_tasks/base.py
@@ -3,9 +3,20 @@
 import random
 import uuid
 from abc import ABC, abstractmethod
+from collections.abc import Iterable, Sequence
 from dataclasses import dataclass
 from datetime import datetime, timedelta
-from typing import Any, ClassVar, Generic, Optional, Protocol, Sequence, TypeVar
+from typing import (
+    Any,
+    ClassVar,
+    Final,
+    Generic,
+    Optional,
+    Protocol,
+    TypedDict,
+    TypeVar,
+    Union,
+)
 
 from sqlalchemy import and_, or_, update
 from sqlalchemy.orm import Mapped
@@ -337,16 +348,71 @@ async def process(self, item: ItemT):
         pass
 
 
-UpdateMap = dict[str, Any]
+class _NowPlaceholder:
+    pass
+
+
+NOW_PLACEHOLDER: Final = _NowPlaceholder()
+"""
+Use `NOW_PLACEHOLDER` together with `resolve_now_placeholders()` in pipeline update maps
+instead of `get_current_datetime()` to have the same current time for all updates in the transaction.
+"""
+
+
+UpdateMapDateTime = Union[datetime, _NowPlaceholder]
+
+
+class _UnlockUpdateMap(TypedDict, total=False):
+    lock_expires_at: Optional[datetime]
+    lock_token: Optional[uuid.UUID]
+    lock_owner: Optional[str]
+
+
+class _ProcessedUpdateMap(TypedDict, total=False):
+    last_processed_at: UpdateMapDateTime
+
+class ItemUpdateMap(_UnlockUpdateMap, _ProcessedUpdateMap, total=False):
+    lock_expires_at: Optional[datetime]
+    lock_token: Optional[uuid.UUID]
+    lock_owner: Optional[str]
+    last_processed_at: UpdateMapDateTime
 
 
-def get_unlock_update_map() -> UpdateMap:
-    return {
-        "lock_expires_at": None,
-        "lock_token": None,
-        "lock_owner": None,
-    }
+def set_unlock_update_map_fields(update_map: _UnlockUpdateMap):
+    update_map["lock_expires_at"] = None
+    update_map["lock_token"] = None
+    update_map["lock_owner"] = None
 
 
-def get_processed_update_map() -> UpdateMap:
-    return {"last_processed_at": get_current_datetime()}
+
+def set_processed_update_map_fields(
+    update_map: _ProcessedUpdateMap,
+    now: UpdateMapDateTime = NOW_PLACEHOLDER,
+):
+    update_map["last_processed_at"] = now
+
+
+class _ResolveNowUpdateMap(Protocol):
+    def items(self) -> Iterable[tuple[str, object]]: ...
+
+
+_ResolveNowInput = Union[_ResolveNowUpdateMap, Sequence[_ResolveNowUpdateMap]]
+
+
+def resolve_now_placeholders(update_values: _ResolveNowInput, now: datetime):
+    """
+    Replaces `NOW_PLACEHOLDER` with `now` in an update map or a sequence of update rows.
+    """
+    if isinstance(update_values, Sequence):
+        for update_row in update_values:
+            resolve_now_placeholders(update_row, now)
+        return
+    # Runtime dict narrowing is required here: pyright doesn't model TypedDicts as
+    # supporting generic dynamic-key mutation via protocol methods.
+ if not isinstance(update_values, dict): + raise TypeError( + "resolve_now_placeholders() expects update maps or sequences of update maps" + ) + for key, value in update_values.items(): + if value is NOW_PLACEHOLDER: + update_values[key] = now diff --git a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py index 33e839b8b6..0ee2975eb2 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py @@ -2,7 +2,7 @@ import uuid from dataclasses import dataclass, field from datetime import datetime, timedelta -from typing import Sequence +from typing import Sequence, TypedDict from sqlalchemy import or_, select, update from sqlalchemy.orm import joinedload, load_only @@ -12,14 +12,17 @@ from dstack._internal.core.models.compute_groups import ComputeGroupStatus from dstack._internal.core.models.instances import InstanceStatus from dstack._internal.server.background.pipeline_tasks.base import ( + NOW_PLACEHOLDER, Fetcher, Heartbeater, + ItemUpdateMap, Pipeline, PipelineItem, - UpdateMap, + UpdateMapDateTime, Worker, - get_processed_update_map, - get_unlock_update_map, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, ) from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ComputeGroupModel, InstanceModel, ProjectModel @@ -199,25 +202,28 @@ async def process(self, item: PipelineItem): ) return - terminate_result = _TerminateResult() + result = _TerminateResult() # TODO: Fetch only compute groups with all instances terminating. 
if all(i.status == InstanceStatus.TERMINATING for i in compute_group_model.instances): - terminate_result = await _terminate_compute_group(compute_group_model) - if terminate_result.compute_group_update_map: + result = await _terminate_compute_group(compute_group_model) + set_processed_update_map_fields(result.compute_group_update_map) + if result.instances_update_map: + set_processed_update_map_fields(result.instances_update_map) + set_unlock_update_map_fields(result.compute_group_update_map) + if result.compute_group_update_map.get("deleted", False): logger.info("Terminated compute group %s", compute_group_model.id) - else: - terminate_result.compute_group_update_map = get_processed_update_map() - - terminate_result.compute_group_update_map |= get_unlock_update_map() async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(result.compute_group_update_map, now=now) + resolve_now_placeholders(result.instances_update_map, now=now) res = await session.execute( update(ComputeGroupModel) .where( ComputeGroupModel.id == compute_group_model.id, ComputeGroupModel.lock_token == compute_group_model.lock_token, ) - .values(**terminate_result.compute_group_update_map) + .values(**result.compute_group_update_map) .returning(ComputeGroupModel.id) ) updated_ids = list(res.scalars().all()) @@ -229,13 +235,13 @@ async def process(self, item: PipelineItem): item.id, ) return - if not terminate_result.instances_update_map: + if not result.instances_update_map: return instances_ids = [i.id for i in compute_group_model.instances] res = await session.execute( update(InstanceModel) .where(InstanceModel.id.in_(instances_ids)) - .values(**terminate_result.instances_update_map) + .values(**result.instances_update_map) ) for instance_model in compute_group_model.instances: emit_instance_status_change_event( @@ -246,10 +252,28 @@ async def process(self, item: PipelineItem): ) +class _ComputeGroupUpdateMap(ItemUpdateMap, total=False): + status: 
ComputeGroupStatus + deleted: bool + deleted_at: UpdateMapDateTime + first_termination_retry_at: UpdateMapDateTime + last_termination_retry_at: UpdateMapDateTime + + +class _InstanceBulkUpdateMap(TypedDict, total=False): + last_processed_at: UpdateMapDateTime + deleted: bool + deleted_at: UpdateMapDateTime + finished_at: UpdateMapDateTime + status: InstanceStatus + + @dataclass class _TerminateResult: - compute_group_update_map: UpdateMap = field(default_factory=dict) - instances_update_map: UpdateMap = field(default_factory=dict) + compute_group_update_map: _ComputeGroupUpdateMap = field( + default_factory=_ComputeGroupUpdateMap + ) + instances_update_map: _InstanceBulkUpdateMap = field(default_factory=_InstanceBulkUpdateMap) async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> _TerminateResult: @@ -283,15 +307,15 @@ async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> _T compute_group, ) except Exception as e: + retry_at = get_current_datetime() + first_termination_retry_at = compute_group_model.first_termination_retry_at if compute_group_model.first_termination_retry_at is None: - result.compute_group_update_map["first_termination_retry_at"] = get_current_datetime() - result.compute_group_update_map["last_termination_retry_at"] = get_current_datetime() - if _next_termination_retry_at( - result.compute_group_update_map["last_termination_retry_at"] - ) < _get_termination_deadline( - result.compute_group_update_map.get( - "first_termination_retry_at", compute_group_model.first_termination_retry_at - ) + result.compute_group_update_map["first_termination_retry_at"] = NOW_PLACEHOLDER + first_termination_retry_at = retry_at + assert first_termination_retry_at is not None + result.compute_group_update_map["last_termination_retry_at"] = NOW_PLACEHOLDER + if _next_termination_retry_at(retry_at) < _get_termination_deadline( + first_termination_retry_at ): logger.warning( "Failed to terminate compute group %s. Will retry. 
Error: %r", @@ -309,11 +333,9 @@ async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> _T exc_info=not isinstance(e, BackendError), ) terminated_result = _get_terminated_result() - return _TerminateResult( - compute_group_update_map=result.compute_group_update_map - | terminated_result.compute_group_update_map, - instances_update_map=result.instances_update_map | terminated_result.instances_update_map, - ) + terminated_result.compute_group_update_map.update(result.compute_group_update_map) + terminated_result.instances_update_map.update(result.instances_update_map) + return terminated_result def _next_termination_retry_at(last_termination_retry_at: datetime) -> datetime: @@ -325,19 +347,16 @@ def _get_termination_deadline(first_termination_retry_at: datetime) -> datetime: def _get_terminated_result() -> _TerminateResult: - now = get_current_datetime() return _TerminateResult( compute_group_update_map={ - "last_processed_at": now, "deleted": True, - "deleted_at": now, + "deleted_at": NOW_PLACEHOLDER, "status": ComputeGroupStatus.TERMINATED, }, instances_update_map={ - "last_processed_at": now, "deleted": True, - "deleted_at": now, - "finished_at": now, + "deleted_at": NOW_PLACEHOLDER, + "finished_at": NOW_PLACEHOLDER, "status": InstanceStatus.TERMINATED, }, ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/fleets.py b/src/dstack/_internal/server/background/pipeline_tasks/fleets.py new file mode 100644 index 0000000000..55ffcd7f94 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/fleets.py @@ -0,0 +1,558 @@ +import asyncio +import uuid +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Sequence, TypedDict + +from sqlalchemy import or_, select, update +from sqlalchemy.ext.asyncio.session import AsyncSession +from sqlalchemy.orm import joinedload, load_only, selectinload + +from dstack._internal.core.models.fleets import FleetSpec, FleetStatus +from 
dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.core.models.runs import RunStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + NOW_PLACEHOLDER, + Fetcher, + Heartbeater, + ItemUpdateMap, + Pipeline, + PipelineItem, + UpdateMapDateTime, + Worker, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + FleetModel, + InstanceModel, + JobModel, + PlacementGroupModel, + RunModel, +) +from dstack._internal.server.services import events +from dstack._internal.server.services.fleets import ( + create_fleet_instance_model, + emit_fleet_status_change_event, + get_fleet_spec, + get_next_instance_num, + is_fleet_empty, + is_fleet_in_use, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +class FleetPipeline(Pipeline[PipelineItem]): + def __init__( + self, + workers_num: int = 10, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=60), + lock_timeout: timedelta = timedelta(seconds=20), + heartbeat_trigger: timedelta = timedelta(seconds=10), + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[PipelineItem]( + model_type=FleetModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = FleetFetcher( + 
queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + FleetWorker(queue=self._queue, heartbeater=self._heartbeater) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return FleetModel.__name__ + + @property + def _heartbeater(self) -> Heartbeater[PipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[PipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["FleetWorker"]: + return self.__workers + + +class FleetFetcher(Fetcher[PipelineItem]): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[PipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_named_task("pipeline_tasks.FleetFetcher.fetch") + async def fetch(self, limit: int) -> list[PipelineItem]: + fleet_lock, _ = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__) + async with fleet_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(FleetModel) + .where( + FleetModel.deleted == False, + or_( + FleetModel.last_processed_at <= now - self._min_processing_interval, + FleetModel.last_processed_at == FleetModel.created_at, + ), + or_( + FleetModel.lock_expires_at.is_(None), + FleetModel.lock_expires_at < now, + ), + or_( + FleetModel.lock_owner.is_(None), + FleetModel.lock_owner == FleetPipeline.__name__, + ), + ) + 
.order_by(FleetModel.last_processed_at.asc()) + .limit(limit) + .with_for_update(skip_locked=True, key_share=True) + .options( + load_only( + FleetModel.id, + FleetModel.lock_token, + FleetModel.lock_expires_at, + ) + ) + ) + fleet_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for fleet_model in fleet_models: + prev_lock_expired = fleet_model.lock_expires_at is not None + fleet_model.lock_expires_at = lock_expires_at + fleet_model.lock_token = lock_token + fleet_model.lock_owner = FleetPipeline.__name__ + items.append( + PipelineItem( + __tablename__=FleetModel.__tablename__, + id=fleet_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + ) + ) + await session.commit() + return items + + +class FleetWorker(Worker[PipelineItem]): + def __init__( + self, + queue: asyncio.Queue[PipelineItem], + heartbeater: Heartbeater[PipelineItem], + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + ) + + @sentry_utils.instrument_named_task("pipeline_tasks.FleetWorker.process") + async def process(self, item: PipelineItem): + async with get_session_ctx() as session: + res = await session.execute( + select(FleetModel) + .where( + FleetModel.id == item.id, + FleetModel.lock_token == item.lock_token, + ) + .options(joinedload(FleetModel.project)) + .options( + selectinload(FleetModel.instances.and_(InstanceModel.deleted == False)) + .joinedload(InstanceModel.jobs) + .load_only(JobModel.id), + ) + .options( + selectinload( + FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) + ).load_only(RunModel.status) + ) + ) + fleet_model = res.unique().scalar_one_or_none() + if fleet_model is None: + logger.warning( + "Failed to process %s item %s: lock_token mismatch." 
+ " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + return + + instance_lock, _ = get_locker(get_db().dialect_name).get_lockset( + InstanceModel.__tablename__ + ) + async with instance_lock: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.fleet_id == item.id, + InstanceModel.deleted == False, + # TODO: Lock instance models in the DB + # or_( + # InstanceModel.lock_expires_at.is_(None), + # InstanceModel.lock_expires_at < get_current_datetime(), + # ), + # or_( + # InstanceModel.lock_owner.is_(None), + # InstanceModel.lock_owner == FleetPipeline.__name__, + # ), + ) + .with_for_update(skip_locked=True, key_share=True) + ) + locked_instance_models = res.scalars().all() + if len(fleet_model.instances) != len(locked_instance_models): + logger.debug( + "Failed to lock fleet %s instances. The fleet will be processed later.", + item.id, + ) + now = get_current_datetime() + # Keep `lock_owner` so that `InstancePipeline` sees that the fleet is being locked + # but unset `lock_expires_at` to process the item again ASAP (after `min_processing_interval`). + # Unset `lock_token` so that heartbeater can no longer update the item. + res = await session.execute( + update(FleetModel) + .where( + FleetModel.id == item.id, + FleetModel.lock_token == item.lock_token, + ) + .values( + lock_expires_at=None, + lock_token=None, + last_processed_at=now, + ) + ) + if res.rowcount == 0: # pyright: ignore[reportAttributeAccessIssue] + logger.warning( + "Failed to reset lock: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration." 
+ ) + return + + # TODO: Lock instance models in the DB + # for instance_model in locked_instance_models: + # instance_model.lock_expires_at = item.lock_expires_at + # instance_model.lock_token = item.lock_token + # instance_model.lock_owner = FleetPipeline.__name__ + # await session.commit() + + result = await _process_fleet(fleet_model) + fleet_update_map = _FleetUpdateMap() + fleet_update_map.update(result.fleet_update_map) + set_processed_update_map_fields(fleet_update_map) + set_unlock_update_map_fields(fleet_update_map) + instance_update_rows = _build_instance_update_rows(result.instance_id_to_update_map) + + async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(fleet_update_map, now=now) + resolve_now_placeholders(instance_update_rows, now=now) + res = await session.execute( + update(FleetModel) + .where( + FleetModel.id == fleet_model.id, + FleetModel.lock_token == fleet_model.lock_token, + ) + .values(**fleet_update_map) + .returning(FleetModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + logger.warning( + "Failed to update %s item %s after processing: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration.", + item.__tablename__, + item.id, + ) + # TODO: Clean up fleet. 
+ return + + if fleet_update_map.get("deleted"): + await session.execute( + update(PlacementGroupModel) + .where(PlacementGroupModel.fleet_id == item.id) + .values(fleet_deleted=True) + ) + if instance_update_rows: + await session.execute( + update(InstanceModel).execution_options(synchronize_session=False), + instance_update_rows, + ) + if result.new_instances_count > 0: + await _create_missing_fleet_instances( + session=session, + fleet_model=fleet_model, + new_instances_count=result.new_instances_count, + ) + emit_fleet_status_change_event( + session=session, + fleet_model=fleet_model, + old_status=fleet_model.status, + new_status=fleet_update_map.get("status", fleet_model.status), + status_message=fleet_update_map.get("status_message", fleet_model.status_message), + ) + + +class _FleetUpdateMap(ItemUpdateMap, total=False): + status: FleetStatus + status_message: str + deleted: bool + deleted_at: UpdateMapDateTime + consolidation_attempt: int + last_consolidated_at: UpdateMapDateTime + + +class _InstanceUpdateMap(TypedDict, total=False): + status: InstanceStatus + termination_reason: InstanceTerminationReason + termination_reason_message: str + deleted: bool + deleted_at: UpdateMapDateTime + last_processed_at: UpdateMapDateTime + id: uuid.UUID + + +@dataclass +class _ProcessResult: + fleet_update_map: _FleetUpdateMap = field(default_factory=_FleetUpdateMap) + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap] = field(default_factory=dict) + new_instances_count: int = 0 + + +@dataclass +class _MaintainNodesResult: + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap] = field(default_factory=dict) + new_instances_count: int = 0 + changes_required: bool = False + + @property + def has_changes(self) -> bool: + return len(self.instance_id_to_update_map) > 0 or self.new_instances_count > 0 + + +async def _process_fleet(fleet_model: FleetModel) -> _ProcessResult: + result = _consolidate_fleet_state_with_spec(fleet_model) + if 
result.new_instances_count > 0: + # Avoid deleting fleets that are about to provision new instances. + return result + delete = _should_delete_fleet(fleet_model) + if delete: + result.fleet_update_map["status"] = FleetStatus.TERMINATED + result.fleet_update_map["deleted"] = True + result.fleet_update_map["deleted_at"] = NOW_PLACEHOLDER + return result + + +def _consolidate_fleet_state_with_spec(fleet_model: FleetModel) -> _ProcessResult: + result = _ProcessResult() + if fleet_model.status == FleetStatus.TERMINATING: + return result + fleet_spec = get_fleet_spec(fleet_model) + if fleet_spec.configuration.nodes is None or fleet_spec.autocreated: + # Only explicitly created cloud fleets are consolidated. + return result + if not _is_fleet_ready_for_consolidation(fleet_model): + return result + maintain_nodes_result = _maintain_fleet_nodes_in_min_max_range(fleet_model, fleet_spec) + if maintain_nodes_result.has_changes: + result.instance_id_to_update_map = maintain_nodes_result.instance_id_to_update_map + result.new_instances_count = maintain_nodes_result.new_instances_count + if maintain_nodes_result.changes_required: + result.fleet_update_map["consolidation_attempt"] = fleet_model.consolidation_attempt + 1 + else: + # The fleet is consolidated with respect to nodes min/max. + result.fleet_update_map["consolidation_attempt"] = 0 + result.fleet_update_map["last_consolidated_at"] = NOW_PLACEHOLDER + return result + + +def _is_fleet_ready_for_consolidation(fleet_model: FleetModel) -> bool: + consolidation_retry_delay = _get_consolidation_retry_delay(fleet_model.consolidation_attempt) + last_consolidated_at = fleet_model.last_consolidated_at or fleet_model.last_processed_at + duration_since_last_consolidation = get_current_datetime() - last_consolidated_at + return duration_since_last_consolidation >= consolidation_retry_delay + + +# We use exponentially increasing consolidation retry delays so that +# consolidation does not happen too often. 
In particular, this prevents +# retrying instance provisioning constantly in case of no offers. +_CONSOLIDATION_RETRY_DELAYS = [ + timedelta(minutes=1), + timedelta(minutes=2), + timedelta(minutes=5), + timedelta(minutes=10), + timedelta(minutes=30), +] + + +def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta: + if consolidation_attempt < len(_CONSOLIDATION_RETRY_DELAYS): + return _CONSOLIDATION_RETRY_DELAYS[consolidation_attempt] + return _CONSOLIDATION_RETRY_DELAYS[-1] + + +def _maintain_fleet_nodes_in_min_max_range( + fleet_model: FleetModel, + fleet_spec: FleetSpec, +) -> _MaintainNodesResult: + """ + Ensures the fleet has at least `nodes.min` and at most `nodes.max` instances. + """ + assert fleet_spec.configuration.nodes is not None + result = _MaintainNodesResult() + for instance in fleet_model.instances: + # Delete terminated but not deleted instances since + # they are going to be replaced with new pending instances. + if instance.status == InstanceStatus.TERMINATED and not instance.deleted: + result.changes_required = True + result.instance_id_to_update_map[instance.id] = { + "deleted": True, + "deleted_at": NOW_PLACEHOLDER, + } + active_instances = [ + i for i in fleet_model.instances if i.status != InstanceStatus.TERMINATED and not i.deleted + ] + active_instances_num = len(active_instances) + if active_instances_num < fleet_spec.configuration.nodes.min: + result.changes_required = True + nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num + result.new_instances_count = nodes_missing + return result + if ( + fleet_spec.configuration.nodes.max is None + or active_instances_num <= fleet_spec.configuration.nodes.max + ): + return result + # Fleet has more instances than allowed by nodes.max. + # This is possible due to race conditions (e.g. provisioning jobs in a fleet concurrently) + # or if nodes.max is updated. 
+ result.changes_required = True + nodes_redundant = active_instances_num - fleet_spec.configuration.nodes.max + for instance in fleet_model.instances: + if nodes_redundant == 0: + break + if instance.status == InstanceStatus.IDLE: + result.instance_id_to_update_map[instance.id] = { + "termination_reason": InstanceTerminationReason.MAX_INSTANCES_LIMIT, + "termination_reason_message": "Fleet has too many instances", + "status": InstanceStatus.TERMINATING, + } + nodes_redundant -= 1 + return result + + +def _should_delete_fleet(fleet_model: FleetModel) -> bool: + if fleet_model.project.deleted: + # It used to be possible to delete project with active resources: + # https://github.com/dstackai/dstack/issues/3077 + logger.info("Fleet %s deleted due to deleted project", fleet_model.name) + return True + + if is_fleet_in_use(fleet_model) or not is_fleet_empty(fleet_model): + return False + + # TODO: Drop non-terminating fleets auto-deletion after dropping fleets auto-creation. + fleet_spec = get_fleet_spec(fleet_model) + if ( + fleet_model.status != FleetStatus.TERMINATING + and fleet_spec.configuration.nodes is not None + and fleet_spec.configuration.nodes.min == 0 + ): + # Empty fleets that allow 0 nodes should not be auto-deleted + return False + + logger.info("Automatic cleanup of an empty fleet %s", fleet_model.name) + return True + + +def _build_instance_update_rows( + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], +) -> list[_InstanceUpdateMap]: + instance_update_rows = [] + for instance_id, instance_update_map in instance_id_to_update_map.items(): + update_row = _InstanceUpdateMap() + update_row.update(instance_update_map) + update_row["id"] = instance_id + set_processed_update_map_fields(update_row) + instance_update_rows.append(update_row) + return instance_update_rows + + +async def _create_missing_fleet_instances( + session: AsyncSession, + fleet_model: FleetModel, + new_instances_count: int, +): + fleet_spec = get_fleet_spec(fleet_model) + 
res = await session.execute( + select(InstanceModel.instance_num).where( + InstanceModel.fleet_id == fleet_model.id, + InstanceModel.deleted == False, + ) + ) + taken_instance_nums = set(res.scalars().all()) + for _ in range(new_instances_count): + instance_num = get_next_instance_num(taken_instance_nums) + instance_model = create_fleet_instance_model( + session=session, + project=fleet_model.project, + # TODO: Store fleet.user and pass it instead of the project owner. + username=fleet_model.project.owner.name, + spec=fleet_spec, + instance_num=instance_num, + ) + instance_model.fleet_id = fleet_model.id + taken_instance_nums.add(instance_num) + events.emit( + session=session, + message=( + "Instance created to meet target fleet node count." + f" Status: {instance_model.status.upper()}" + ), + actor=events.SystemActor(), + targets=[events.Target.from_model(instance_model)], + ) + logger.info( + "Added %d instances to fleet %s", + new_instances_count, + fleet_model.name, + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/gateways.py b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py index cdd0904e1a..2d5f0a947b 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/gateways.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py @@ -2,7 +2,7 @@ import uuid from dataclasses import dataclass, field from datetime import timedelta -from typing import Optional, Sequence +from typing import Optional, Sequence, TypedDict from sqlalchemy import delete, or_, select, update from sqlalchemy.orm import joinedload, load_only @@ -14,12 +14,13 @@ from dstack._internal.server.background.pipeline_tasks.base import ( Fetcher, Heartbeater, + ItemUpdateMap, Pipeline, PipelineItem, - UpdateMap, Worker, - get_processed_update_map, - get_unlock_update_map, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, ) from dstack._internal.server.db import get_db, get_session_ctx from 
dstack._internal.server.models import ( @@ -227,13 +228,18 @@ async def _process_submitted_item(item: GatewayPipelineItem): return result = await _process_submitted_gateway(gateway_model) - update_map = result.update_map | get_processed_update_map() | get_unlock_update_map() + update_map = _GatewayUpdateMap() + update_map.update(result.update_map) + set_processed_update_map_fields(update_map) + set_unlock_update_map_fields(update_map) async with get_session_ctx() as session: gateway_compute_model = result.gateway_compute_model if gateway_compute_model is not None: session.add(gateway_compute_model) await session.flush() update_map["gateway_compute_id"] = gateway_compute_model.id + now = get_current_datetime() + resolve_now_placeholders(update_map, now=now) res = await session.execute( update(GatewayModel) .where( @@ -262,9 +268,20 @@ async def _process_submitted_item(item: GatewayPipelineItem): ) +class _GatewayUpdateMap(ItemUpdateMap, total=False): + status: GatewayStatus + status_message: str + gateway_compute_id: uuid.UUID + + +class _GatewayComputeUpdateMap(TypedDict, total=False): + active: bool + deleted: bool + + @dataclass class _SubmittedResult: - update_map: UpdateMap = field(default_factory=dict) + update_map: _GatewayUpdateMap = field(default_factory=_GatewayUpdateMap) gateway_compute_model: Optional[GatewayComputeModel] = None @@ -337,15 +354,20 @@ async def _process_provisioning_item(item: GatewayPipelineItem): return result = await _process_provisioning_gateway(gateway_model) - update_map = result.gateway_update_map | get_processed_update_map() | get_unlock_update_map() + gateway_update_map = result.gateway_update_map + set_processed_update_map_fields(gateway_update_map) + set_unlock_update_map_fields(gateway_update_map) + async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(gateway_update_map, now=now) res = await session.execute( update(GatewayModel) .where( GatewayModel.id == gateway_model.id, 
GatewayModel.lock_token == gateway_model.lock_token, ) - .values(**update_map) + .values(**gateway_update_map) .returning(GatewayModel.id) ) updated_ids = list(res.scalars().all()) @@ -361,8 +383,8 @@ async def _process_provisioning_item(item: GatewayPipelineItem): session=session, gateway_model=gateway_model, old_status=gateway_model.status, - new_status=update_map.get("status", gateway_model.status), - status_message=update_map.get("status_message", gateway_model.status_message), + new_status=gateway_update_map.get("status", gateway_model.status), + status_message=gateway_update_map.get("status_message", gateway_model.status_message), ) if result.gateway_compute_update_map: res = await session.execute( @@ -383,8 +405,10 @@ async def _process_provisioning_item(item: GatewayPipelineItem): @dataclass class _ProvisioningResult: - gateway_update_map: UpdateMap = field(default_factory=dict) - gateway_compute_update_map: UpdateMap = field(default_factory=dict) + gateway_update_map: _GatewayUpdateMap = field(default_factory=_GatewayUpdateMap) + gateway_compute_update_map: _GatewayComputeUpdateMap = field( + default_factory=_GatewayComputeUpdateMap + ) async def _process_provisioning_gateway(gateway_model: GatewayModel) -> _ProvisioningResult: @@ -475,13 +499,17 @@ async def _process_to_be_deleted_item(item: GatewayPipelineItem): targets=[events.Target.from_model(gateway_model)], ) else: + update_map = _GatewayUpdateMap() + set_processed_update_map_fields(update_map) + set_unlock_update_map_fields(update_map) + resolve_now_placeholders(update_map, now=get_current_datetime()) res = await session.execute( update(GatewayModel) .where( GatewayModel.id == gateway_model.id, GatewayModel.lock_token == gateway_model.lock_token, ) - .values(**get_processed_update_map()) + .values(**update_map) .returning(GatewayModel.id) ) updated_ids = list(res.scalars().all()) @@ -513,12 +541,14 @@ async def _process_to_be_deleted_item(item: GatewayPipelineItem): @dataclass -class 
_DeletedResult: +class _ProcessToBeDeletedResult: delete_gateway: bool - gateway_compute_update_map: UpdateMap = field(default_factory=dict) + gateway_compute_update_map: _GatewayComputeUpdateMap = field( + default_factory=_GatewayComputeUpdateMap + ) -async def _process_to_be_deleted_gateway(gateway_model: GatewayModel) -> _DeletedResult: +async def _process_to_be_deleted_gateway(gateway_model: GatewayModel) -> _ProcessToBeDeletedResult: assert gateway_model.backend.type != BackendType.DSTACK backend = await backends_services.get_project_backend_by_type_or_error( project=gateway_model.project, backend_type=gateway_model.backend.type @@ -542,9 +572,9 @@ async def _process_to_be_deleted_gateway(gateway_model: GatewayModel) -> _Delete "Error when deleting gateway compute for %s", gateway_model.name, ) - return _DeletedResult(delete_gateway=False) + return _ProcessToBeDeletedResult(delete_gateway=False) logger.info("Deleted gateway compute for %s", gateway_model.name) - result = _DeletedResult(delete_gateway=True) + result = _ProcessToBeDeletedResult(delete_gateway=True) if gateway_model.gateway_compute is not None: await gateway_connections_pool.remove(gateway_model.gateway_compute.ip_address) result.gateway_compute_update_map = {"active": False, "deleted": True} diff --git a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py index 193358ec0f..703cfe1548 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py @@ -1,5 +1,6 @@ import asyncio import uuid +from dataclasses import dataclass, field from datetime import timedelta from typing import Sequence @@ -9,14 +10,17 @@ from dstack._internal.core.backends.base.compute import ComputeWithPlacementGroupSupport from dstack._internal.core.errors import PlacementGroupInUseError from 
dstack._internal.server.background.pipeline_tasks.base import ( + NOW_PLACEHOLDER, Fetcher, Heartbeater, + ItemUpdateMap, Pipeline, PipelineItem, - UpdateMap, + UpdateMapDateTime, Worker, - get_processed_update_map, - get_unlock_update_map, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, ) from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ( @@ -193,15 +197,15 @@ async def process(self, item: PipelineItem): ) return - update_map = await _delete_placement_group(placement_group_model) - if update_map: + result = await _delete_placement_group(placement_group_model) + update_map = result.update_map + set_processed_update_map_fields(update_map) + set_unlock_update_map_fields(update_map) + if update_map.get("deleted", False): logger.info("Deleted placement group %s", placement_group_model.name) - else: - update_map = get_processed_update_map() - - update_map |= get_unlock_update_map() async with get_session_ctx() as session: + resolve_now_placeholders(update_map, now=get_current_datetime()) res = await session.execute( update(PlacementGroupModel) .where( @@ -221,13 +225,25 @@ async def process(self, item: PipelineItem): ) -async def _delete_placement_group(placement_group_model: PlacementGroupModel) -> UpdateMap: +class _PlacementGroupUpdateMap(ItemUpdateMap, total=False): + deleted: bool + deleted_at: UpdateMapDateTime + + +@dataclass +class _DeleteResult: + update_map: _PlacementGroupUpdateMap = field(default_factory=_PlacementGroupUpdateMap) + + +async def _delete_placement_group( + placement_group_model: PlacementGroupModel, +) -> _DeleteResult: placement_group = placement_group_model_to_placement_group(placement_group_model) if placement_group.provisioning_data is None: logger.error( "Failed to delete placement group %s. 
provisioning_data is None.", placement_group.name ) - return _get_deleted_update_map() + return _get_deleted_result() backend = await backends_services.get_project_backend_by_type( project=placement_group_model.project, backend_type=placement_group.provisioning_data.backend, @@ -238,7 +254,7 @@ async def _delete_placement_group(placement_group_model: PlacementGroupModel) -> "Failed to delete placement group %s. Backend not available. Please delete it manually.", placement_group.name, ) - return _get_deleted_update_map() + return _get_deleted_result() compute = backend.compute() assert isinstance(compute, ComputeWithPlacementGroupSupport) try: @@ -247,22 +263,18 @@ async def _delete_placement_group(placement_group_model: PlacementGroupModel) -> logger.info( "Placement group %s is still in use. Skipping deletion for now.", placement_group.name ) - return {} + return _DeleteResult() except Exception: # TODO: Retry deletion logger.exception( "Got exception when deleting placement group %s. Please delete it manually.", placement_group.name, ) - return _get_deleted_update_map() - - return _get_deleted_update_map() + return _get_deleted_result() -def _get_deleted_update_map() -> UpdateMap: - now = get_current_datetime() - return { - "last_processed_at": now, - "deleted": True, - "deleted_at": now, - } +def _get_deleted_result() -> _DeleteResult: + update_map = _PlacementGroupUpdateMap() + update_map["deleted"] = True + update_map["deleted_at"] = NOW_PLACEHOLDER + return _DeleteResult(update_map=update_map) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/volumes.py b/src/dstack/_internal/server/background/pipeline_tasks/volumes.py index 578fe8423b..c7a8f5761a 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/volumes.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/volumes.py @@ -11,14 +11,17 @@ from dstack._internal.core.errors import BackendError, BackendNotAvailable from dstack._internal.core.models.volumes import 
VolumeStatus from dstack._internal.server.background.pipeline_tasks.base import ( + NOW_PLACEHOLDER, Fetcher, Heartbeater, + ItemUpdateMap, Pipeline, PipelineItem, - UpdateMap, + UpdateMapDateTime, Worker, - get_processed_update_map, - get_unlock_update_map, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, ) from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ( @@ -233,8 +236,12 @@ async def _process_submitted_item(item: VolumePipelineItem): return result = await _process_submitted_volume(volume_model) - update_map = result.update_map | get_processed_update_map() | get_unlock_update_map() + update_map = result.update_map + set_processed_update_map_fields(update_map) + set_unlock_update_map_fields(update_map) + async with get_session_ctx() as session: + resolve_now_placeholders(update_map, now=get_current_datetime()) res = await session.execute( update(VolumeModel) .where( @@ -263,9 +270,17 @@ async def _process_submitted_item(item: VolumePipelineItem): ) +class _VolumeUpdateMap(ItemUpdateMap, total=False): + status: VolumeStatus + status_message: str + volume_provisioning_data: str + deleted: bool + deleted_at: UpdateMapDateTime + + @dataclass class _SubmittedResult: - update_map: UpdateMap = field(default_factory=dict) + update_map: _VolumeUpdateMap = field(default_factory=_VolumeUpdateMap) async def _process_submitted_volume(volume_model: VolumeModel) -> _SubmittedResult: @@ -363,8 +378,13 @@ async def _process_to_be_deleted_item(item: VolumePipelineItem): return result = await _process_to_be_deleted_volume(volume_model) - update_map = result.update_map | get_unlock_update_map() + update_map = _VolumeUpdateMap() + update_map.update(result.update_map) + set_processed_update_map_fields(update_map) + set_unlock_update_map_fields(update_map) async with get_session_ctx() as session: + now = get_current_datetime() + resolve_now_placeholders(update_map, now=now) res = 
await session.execute( update(VolumeModel) .where( @@ -392,11 +412,11 @@ async def _process_to_be_deleted_item(item: VolumePipelineItem): @dataclass -class _DeletedResult: - update_map: UpdateMap = field(default_factory=dict) +class _ProcessToBeDeletedResult: + update_map: _VolumeUpdateMap = field(default_factory=_VolumeUpdateMap) -async def _process_to_be_deleted_volume(volume_model: VolumeModel) -> _DeletedResult: +async def _process_to_be_deleted_volume(volume_model: VolumeModel) -> _ProcessToBeDeletedResult: volume = volume_model_to_volume(volume_model) if volume.external: return _get_deleted_result() @@ -437,12 +457,10 @@ async def _process_to_be_deleted_volume(volume_model: VolumeModel) -> _DeletedRe return _get_deleted_result() -def _get_deleted_result() -> _DeletedResult: - now = get_current_datetime() - return _DeletedResult( +def _get_deleted_result() -> _ProcessToBeDeletedResult: + return _ProcessToBeDeletedResult( update_map={ - "last_processed_at": now, "deleted": True, - "deleted_at": now, + "deleted_at": NOW_PLACEHOLDER, } ) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py index 45ae8ec7fd..9c7cd6ac1a 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py @@ -102,13 +102,13 @@ def start_scheduled_tasks() -> AsyncIOScheduler: _scheduler.add_job( process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1 ) - _scheduler.add_job( - process_fleets, - IntervalTrigger(seconds=10, jitter=2), - max_instances=1, - ) _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1) if not FeatureFlags.PIPELINE_PROCESSING_ENABLED: + _scheduler.add_job( + process_fleets, + IntervalTrigger(seconds=10, jitter=2), + max_instances=1, + ) _scheduler.add_job( process_gateways, IntervalTrigger(seconds=10, jitter=2), 
max_instances=5 ) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py b/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py index feb1cc5070..58d6b2c8b7 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/compute_groups.py @@ -32,6 +32,8 @@ TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15) +# NOTE: This scheduled task is going to be deprecated in favor of `ComputeGroupPipeline`. +# If this logic changes before removal, keep `pipeline_tasks/compute_groups.py` in sync. async def process_compute_groups(batch_size: int = 1): tasks = [] for _ in range(batch_size): diff --git a/src/dstack/_internal/server/background/scheduled_tasks/fleets.py b/src/dstack/_internal/server/background/scheduled_tasks/fleets.py index a758f86ada..6b1ba7667a 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/fleets.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/fleets.py @@ -5,10 +5,11 @@ from sqlalchemy import select, update from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import joinedload, load_only, selectinload, with_loader_criteria +from sqlalchemy.orm import joinedload, load_only, selectinload from dstack._internal.core.models.fleets import FleetSpec, FleetStatus from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.core.models.runs import RunStatus from dstack._internal.server.db import get_db, get_session_ctx from dstack._internal.server.models import ( FleetModel, @@ -39,6 +40,8 @@ MIN_PROCESSING_INTERVAL = timedelta(seconds=30) +# NOTE: This scheduled task is going to be deprecated in favor of `FleetPipeline`. +# If this logic changes before removal, keep `pipeline_tasks/fleets.py` in sync. 
@sentry_utils.instrument_scheduled_task async def process_fleets(): fleet_lock, fleet_lockset = get_locker(get_db().dialect_name).get_lockset( @@ -59,10 +62,9 @@ async def process_fleets(): ) .options( load_only(FleetModel.id, FleetModel.name), - selectinload(FleetModel.instances).load_only(InstanceModel.id), - with_loader_criteria( - InstanceModel, InstanceModel.deleted == False, include_aliases=True - ), + selectinload( + FleetModel.instances.and_(InstanceModel.deleted == False) + ).load_only(InstanceModel.id), ) .order_by(FleetModel.last_processed_at.asc()) .limit(BATCH_SIZE) @@ -115,14 +117,17 @@ async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]) res = await session.execute( select(FleetModel) .where(FleetModel.id.in_(fleet_ids)) + .options(joinedload(FleetModel.project)) .options( - joinedload(FleetModel.instances).joinedload(InstanceModel.jobs).load_only(JobModel.id), - with_loader_criteria( - InstanceModel, InstanceModel.deleted == False, include_aliases=True - ), + selectinload(FleetModel.instances.and_(InstanceModel.deleted == False)) + .joinedload(InstanceModel.jobs) + .load_only(JobModel.id), + ) + .options( + selectinload( + FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) + ).load_only(RunModel.status) ) - .options(joinedload(FleetModel.project)) - .options(joinedload(FleetModel.runs).load_only(RunModel.status)) .execution_options(populate_existing=True) ) fleet_models = list(res.unique().scalars().all()) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/gateways.py b/src/dstack/_internal/server/background/scheduled_tasks/gateways.py index fc12e8e3b8..262f45a180 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/gateways.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/gateways.py @@ -35,6 +35,8 @@ async def process_gateways_connections(): await _process_active_connections() +# NOTE: This scheduled task is going to be deprecated in favor of 
`GatewayPipeline`. +# If this logic changes before removal, keep `pipeline_tasks/gateways.py` in sync. @sentry_utils.instrument_scheduled_task async def process_gateways(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py b/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py index 71ab51b07b..1106ce4912 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/placement_groups.py @@ -19,6 +19,8 @@ logger = get_logger(__name__) +# NOTE: This scheduled task is going to be deprecated in favor of `PlacementGroupPipeline`. +# If this logic changes before removal, keep `pipeline_tasks/placement_groups.py` in sync. @sentry_utils.instrument_scheduled_task async def process_placement_groups(): lock, lockset = get_locker(get_db().dialect_name).get_lockset( diff --git a/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py b/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py index 5d1b2e1a79..151f07deeb 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py @@ -13,7 +13,6 @@ load_only, noload, selectinload, - with_loader_criteria, ) from dstack._internal.core.backends.base.backend import Backend @@ -223,9 +222,8 @@ async def _process_submitted_job( .where(JobModel.id == job_model.id) .options(joinedload(JobModel.instance)) .options( - joinedload(JobModel.fleet).joinedload(FleetModel.instances), - with_loader_criteria( - InstanceModel, InstanceModel.deleted == False, include_aliases=True + joinedload(JobModel.fleet).selectinload( + FleetModel.instances.and_(InstanceModel.deleted == False) ), ) ) @@ -236,9 +234,8 @@ async def _process_submitted_job( 
.options(joinedload(RunModel.project).joinedload(ProjectModel.backends)) .options(joinedload(RunModel.user).load_only(UserModel.name)) .options( - joinedload(RunModel.fleet).joinedload(FleetModel.instances), - with_loader_criteria( - InstanceModel, InstanceModel.deleted == False, include_aliases=True + joinedload(RunModel.fleet).selectinload( + FleetModel.instances.and_(InstanceModel.deleted == False) ), ) ) @@ -584,6 +581,8 @@ async def _fetch_fleet_with_master_instance_provisioning_data( # To avoid violating fleet placement cluster during master provisioning, # we must lock empty fleets and respect existing instances in non-empty fleets. # On SQLite always take the lock during master provisioning for simplicity. + # It's fine to lock fleets currently locked by pipelines (with lock_* fields set) + # since we won't update fleets – we only need to ensure there is no parallel provisioning. await exit_stack.enter_async_context( get_locker(get_db().dialect_name).lock_ctx( FleetModel.__tablename__, [fleet_model.id] diff --git a/src/dstack/_internal/server/background/scheduled_tasks/volumes.py b/src/dstack/_internal/server/background/scheduled_tasks/volumes.py index a61f796947..11e6f3c591 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/volumes.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/volumes.py @@ -24,6 +24,8 @@ logger = get_logger(__name__) +# NOTE: This scheduled task is going to be deprecated in favor of `VolumePipeline`. +# If this logic changes before removal, keep `pipeline_tasks/volumes.py` in sync. 
@sentry_utils.instrument_scheduled_task async def process_submitted_volumes(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(VolumeModel.__tablename__) diff --git a/src/dstack/_internal/server/migrations/versions/2026/02_27_1218_d21d3e61de27_add_fleetmodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/02_27_1218_d21d3e61de27_add_fleetmodel_pipeline_columns.py new file mode 100644 index 0000000000..fad3da7909 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/02_27_1218_d21d3e61de27_add_fleetmodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add FleetModel pipeline columns + +Revision ID: d21d3e61de27 +Revises: 9a363c3cbe04 +Create Date: 2026-02-27 12:18:01.768776+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "d21d3e61de27" +down_revision = "9a363c3cbe04" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_02_0530_46150101edec_add_ix_fleets_pipeline_fetch_q_index.py b/src/dstack/_internal/server/migrations/versions/2026/03_02_0530_46150101edec_add_ix_fleets_pipeline_fetch_q_index.py new file mode 100644 index 0000000000..365aac41cf --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_02_0530_46150101edec_add_ix_fleets_pipeline_fetch_q_index.py @@ -0,0 +1,49 @@ +"""Add ix_fleets_pipeline_fetch_q index + +Revision ID: 46150101edec +Revises: d21d3e61de27 +Create Date: 2026-03-02 05:30:07.196407+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "46150101edec" +down_revision = "d21d3e61de27" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_fleets_pipeline_fetch_q", + table_name="fleets", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_fleets_pipeline_fetch_q", + "fleets", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("deleted = 0"), + postgresql_where=sa.text("deleted IS FALSE"), + postgresql_concurrently=True, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_fleets_pipeline_fetch_q", + table_name="fleets", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index a7a8ec0bd6..15c5488da5 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -576,7 +576,7 @@ class PoolModel(BaseModel): instances: Mapped[List["InstanceModel"]] = relationship(back_populates="pool", lazy="selectin") -class FleetModel(BaseModel): +class FleetModel(PipelineModelMixin, BaseModel): __tablename__ = "fleets" id: Mapped[uuid.UUID] = mapped_column( @@ -604,9 +604,20 @@ class FleetModel(BaseModel): jobs: Mapped[List["JobModel"]] = relationship(back_populates="fleet") instances: Mapped[List["InstanceModel"]] = relationship(back_populates="fleet") + # `consolidation_attempt` counts how many times in a row fleet needed consolidation. + # Allows increasing delays between attempts. 
consolidation_attempt: Mapped[int] = mapped_column(Integer, server_default="0") last_consolidated_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + __table_args__ = ( + Index( + "ix_fleets_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) + class InstanceModel(BaseModel): __tablename__ = "instances" diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index c0483ebb61..c0ec21aeaa 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -102,11 +102,45 @@ def switch_fleet_status( return fleet_model.status = new_status + emit_fleet_status_change_event( + session=session, + fleet_model=fleet_model, + old_status=old_status, + new_status=new_status, + status_message=fleet_model.status_message, + actor=actor, + ) - msg = f"Fleet status changed {old_status.upper()} -> {new_status.upper()}" + +def emit_fleet_status_change_event( + session: AsyncSession, + fleet_model: FleetModel, + old_status: FleetStatus, + new_status: FleetStatus, + status_message: Optional[str], + actor: events.AnyActor = events.SystemActor(), +) -> None: + if old_status == new_status: + return + msg = get_fleet_status_change_message( + old_status=old_status, + new_status=new_status, + status_message=status_message, + ) events.emit(session, msg, actor=actor, targets=[events.Target.from_model(fleet_model)]) +def get_fleet_status_change_message( + old_status: FleetStatus, + new_status: FleetStatus, + status_message: Optional[str], +) -> str: + msg = f"Fleet status changed {old_status.upper()} -> {new_status.upper()}" + if status_message is not None: + msg += f" ({status_message})" + return msg + + async def list_projects_with_no_active_fleets( session: AsyncSession, user: UserModel, @@ -225,7 +259,7 @@ async def list_projects_fleet_models( .where(*filters) .order_by(*order_by) .limit(limit) - 
.options(joinedload(FleetModel.instances.and_(InstanceModel.deleted == False))) + .options(selectinload(FleetModel.instances.and_(InstanceModel.deleted == False))) ) fleet_models = list(res.unique().scalars().all()) return fleet_models @@ -256,7 +290,7 @@ async def list_project_fleet_models( res = await session.execute( select(FleetModel) .where(*filters) - .options(joinedload(FleetModel.instances.and_(InstanceModel.deleted == False))) + .options(selectinload(FleetModel.instances.and_(InstanceModel.deleted == False))) ) return list(res.unique().scalars().all()) @@ -485,13 +519,24 @@ async def apply_plan( .joinedload(InstanceModel.jobs) .load_only(JobModel.id) ) - .options(selectinload(FleetModel.runs)) + # `is_fleet_in_use()` only needs active run presence/status. + .options( + selectinload( + FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) + ).load_only(RunModel.id, RunModel.status) + ) .execution_options(populate_existing=True) .order_by(FleetModel.id) # take locks in order .with_for_update(key_share=True) ) fleet_model = res.scalars().unique().one_or_none() if fleet_model is not None: + if fleet_model.lock_expires_at is not None: + # TODO: Make the endpoint fully async so we don't need to lock and error: + # put the request in queue and process in the background. + raise ServerClientError( + "Failed to update fleet: fleet is being processed currently. Try again later." 
+ ) return await _update_fleet( session=session, user=user, @@ -629,8 +674,7 @@ async def delete_fleets( FleetModel.name.in_(names), FleetModel.deleted == False, ) - .order_by(FleetModel.id) # take locks in order - .with_for_update(key_share=True) + .order_by(FleetModel.id) ) fleets_ids = list(res.scalars().unique().all()) res = await session.execute( @@ -639,8 +683,7 @@ async def delete_fleets( InstanceModel.fleet_id.in_(fleets_ids), InstanceModel.deleted == False, ) - .order_by(InstanceModel.id) # take locks in order - .with_for_update(key_share=True) + .order_by(InstanceModel.id) ) instances_ids = list(res.scalars().unique().all()) if is_db_sqlite(): @@ -654,22 +697,56 @@ async def delete_fleets( # TODO: Do not lock fleet when deleting only instances. res = await session.execute( select(FleetModel) - .where(FleetModel.id.in_(fleets_ids)) + .where( + FleetModel.project_id == project.id, + FleetModel.id.in_(fleets_ids), + FleetModel.deleted == False, + FleetModel.lock_expires_at.is_(None), + ) .options( - joinedload(FleetModel.instances.and_(InstanceModel.id.in_(instances_ids))) - .joinedload(InstanceModel.jobs) + selectinload(FleetModel.instances.and_(InstanceModel.id.in_(instances_ids))) + .selectinload(InstanceModel.jobs) .load_only(JobModel.id) ) .options( - joinedload( + selectinload( FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) - ) + ).load_only(RunModel.status) ) .execution_options(populate_existing=True) + .order_by(FleetModel.id) # take locks in order + .with_for_update(key_share=True, of=FleetModel) ) fleet_models = res.scalars().unique().all() - fleets = [fleet_model_to_fleet(m) for m in fleet_models] - for fleet in fleets: + if len(fleet_models) != len(fleets_ids): + # TODO: Make the endpoint fully async so we don't need to lock and error: + # put the request in queue and process in the background. + msg = ( + "Failed to delete fleets: fleets are being processed currently. Try again later." 
+ if instance_nums is None + else "Failed to delete fleet instances: fleets are being processed currently. Try again later." + ) + raise ServerClientError(msg) + res = await session.execute( + select(InstanceModel.id) + .where( + InstanceModel.id.in_(instances_ids), + InstanceModel.deleted == False, + ) + .order_by(InstanceModel.id) # take locks in order + .with_for_update(key_share=True, of=InstanceModel) + .execution_options(populate_existing=True) + ) + instance_models_ids = list(res.scalars().unique().all()) + if len(instance_models_ids) != len(instances_ids): + msg = ( + "Failed to delete fleets: fleet instances are being processed currently. Try again later." + if instance_nums is None + else "Failed to delete fleet instances: fleet instances are being processed currently. Try again later." + ) + raise ServerClientError(msg) + for fleet_model in fleet_models: + fleet = fleet_model_to_fleet(fleet_model) if fleet.spec.configuration.ssh_config is not None: _check_can_manage_ssh_fleets(user=user, project=project) if instance_nums is None: diff --git a/src/dstack/_internal/server/services/gateways/__init__.py b/src/dstack/_internal/server/services/gateways/__init__.py index 762af8bef1..ddc3d64c44 100644 --- a/src/dstack/_internal/server/services/gateways/__init__.py +++ b/src/dstack/_internal/server/services/gateways/__init__.py @@ -356,7 +356,7 @@ async def _delete_gateways_pipeline( ) gateway_models = res.scalars().all() if len(gateway_models) != len(gateways_ids): - # TODO: Make the delete endpoint fully async so we don't need to lock and error: + # TODO: Make the endpoint fully async so we don't need to lock and error: # put the request in queue and process in the background. raise ServerClientError( "Failed to delete gateways: gateways are being processed currently. Try again later." 
diff --git a/src/dstack/_internal/server/services/volumes.py b/src/dstack/_internal/server/services/volumes.py index f0d2fc703e..1c846c724f 100644 --- a/src/dstack/_internal/server/services/volumes.py +++ b/src/dstack/_internal/server/services/volumes.py @@ -369,7 +369,7 @@ async def _delete_volumes_pipeline( ) volume_models = res.scalars().unique().all() if len(volume_models) != len(volumes_ids): - # TODO: Make the delete endpoint fully async so we don't need to lock and error: + # TODO: Make the endpoint fully async so we don't need to lock and error: # put the request in queue and process in the background. raise ServerClientError( "Failed to delete volumes: volumes are being processed currently. Try again later." diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py b/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py new file mode 100644 index 0000000000..746ddf2ea4 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py @@ -0,0 +1,398 @@ +import uuid +from datetime import datetime, timezone +from unittest.mock import Mock + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.fleets import FleetNodesSpec, FleetStatus +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.runs import RunStatus +from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.background.pipeline_tasks.base import PipelineItem +from dstack._internal.server.background.pipeline_tasks.fleets import ( + FleetWorker, +) +from dstack._internal.server.models import FleetModel, InstanceModel +from dstack._internal.server.services.projects import add_project_member +from dstack._internal.server.testing.common import ( + create_fleet, + create_instance, + create_placement_group, + create_project, + create_repo, + create_run, + create_user, + 
get_fleet_spec, +) + + +@pytest.fixture +def worker() -> FleetWorker: + return FleetWorker(queue=Mock(), heartbeater=Mock()) + + +def _fleet_to_pipeline_item(fleet: FleetModel) -> PipelineItem: + assert fleet.lock_token is not None + assert fleet.lock_expires_at is not None + return PipelineItem( + __tablename__=fleet.__tablename__, + id=fleet.id, + lock_token=fleet.lock_token, + lock_expires_at=fleet.lock_expires_at, + prev_lock_expired=False, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestFleetWorker: + async def test_deletes_empty_autocreated_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.autocreated = True + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.deleted + + async def test_deletes_terminating_user_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.autocreated = False + fleet = await create_fleet( + session=session, + project=project, + status=FleetStatus.TERMINATING, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.deleted + + async def test_does_not_delete_fleet_with_active_run( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + user = await create_user(session=session, global_role=GlobalRole.USER) + 
await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + repo = await create_repo( + session=session, + project_id=project.id, + ) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + status=RunStatus.RUNNING, + ) + fleet.runs.append(run) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert not fleet.deleted + + async def test_does_not_delete_fleet_with_instance( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + user = await create_user(session=session, global_role=GlobalRole.USER) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + fleet.instances.append(instance) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert not fleet.deleted + + async def test_consolidation_creates_missing_instances( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=2, target=2, max=2) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=1, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + 
await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = (await session.execute(select(InstanceModel))).scalars().all() + assert len(instances) == 2 + assert {i.instance_num for i in instances} == {0, 1} + assert fleet.consolidation_attempt == 1 + + async def test_consolidation_terminates_redundant_instances( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance1 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + instance_num=0, + ) + instance2 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=1, + ) + instance3 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + instance_num=2, + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + await session.refresh(instance3) + assert instance1.status == InstanceStatus.BUSY + assert instance2.status == InstanceStatus.TERMINATING + assert instance3.deleted + assert fleet.consolidation_attempt == 1 + + async def test_consolidation_attempt_increments_when_over_max_and_no_idle_instances( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance1 = await 
create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + instance_num=0, + ) + instance2 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.BUSY, + instance_num=1, + ) + + fleet.consolidation_attempt = 2 + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + assert instance1.status == InstanceStatus.BUSY + assert instance2.status == InstanceStatus.BUSY + assert fleet.consolidation_attempt == 3 + + async def test_marks_placement_groups_fleet_deleted_on_fleet_delete( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + status=FleetStatus.TERMINATING, + ) + placement_group1 = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test-pg-1", + ) + placement_group2 = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test-pg-2", + ) + + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(placement_group1) + await session.refresh(placement_group2) + assert fleet.deleted + assert placement_group1.fleet_deleted + assert placement_group2.fleet_deleted + + async def test_consolidation_respects_retry_delay( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=2, target=2, max=2) + fleet = await create_fleet( + session=session, + 
project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + fleet.consolidation_attempt = 1 + fleet.last_consolidated_at = datetime.now(timezone.utc) + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + ) + .scalars() + .all() + ) + assert len(instances) == 1 + assert fleet.consolidation_attempt == 1 + assert not fleet.deleted + + async def test_consolidation_attempt_resets_when_no_changes( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + fleet.consolidation_attempt = 3 + previous_last_consolidated_at = datetime(2023, 1, 2, 3, 4, tzinfo=timezone.utc) + fleet.last_consolidated_at = previous_last_consolidated_at + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel).where( + InstanceModel.fleet_id == fleet.id, + InstanceModel.deleted == False, + ) + ) + ) + .scalars() + .all() + ) + assert len(instances) == 1 + assert fleet.consolidation_attempt == 0 + assert ( + fleet.last_consolidated_at is not None + and fleet.last_consolidated_at > 
previous_last_consolidated_at + ) diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py b/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py index 9628451bdc..59cbd370e9 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py @@ -257,6 +257,7 @@ async def test_keeps_gateway_if_terminate_fails( ) gateway.lock_token = uuid.uuid4() gateway.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + gateway.lock_owner = "GatewayPipeline" gateway.to_be_deleted = True original_last_processed_at = gateway.last_processed_at await session.commit() @@ -286,6 +287,9 @@ async def test_keeps_gateway_if_terminate_fails( await session.refresh(gateway_compute) assert gateway.to_be_deleted is True assert gateway.last_processed_at > original_last_processed_at + assert gateway.lock_token is None + assert gateway.lock_expires_at is None + assert gateway.lock_owner is None assert gateway_compute.active is True assert gateway_compute.deleted is False events = await list_events(session) diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py index 7baed58b64..c23d5e604d 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py @@ -5,6 +5,7 @@ import pytest from sqlalchemy.ext.asyncio import AsyncSession +from dstack._internal.core.errors import PlacementGroupInUseError from dstack._internal.server.background.pipeline_tasks.base import PipelineItem from dstack._internal.server.background.pipeline_tasks.placement_groups import PlacementGroupWorker from dstack._internal.server.models import PlacementGroupModel @@ -62,3 +63,41 @@ async def test_deletes_placement_group( 
aws_mock.compute.return_value.delete_placement_group.assert_called_once() await session.refresh(placement_group) assert placement_group.deleted + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_retries_placement_group_deletion_if_still_in_use( + self, test_db, session: AsyncSession, worker: PlacementGroupWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + ) + placement_group = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="test2-pg", + fleet_deleted=True, + ) + placement_group.lock_token = uuid.uuid4() + placement_group.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + placement_group.lock_owner = "PlacementGroupPipeline" + original_last_processed_at = placement_group.last_processed_at + await session.commit() + with patch("dstack._internal.server.services.backends.get_project_backend_by_type") as m: + aws_mock = Mock() + m.return_value = aws_mock + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.delete_placement_group.side_effect = ( + PlacementGroupInUseError() + ) + await worker.process(_placement_group_to_pipeline_item(placement_group)) + aws_mock.compute.return_value.delete_placement_group.assert_called_once() + await session.refresh(placement_group) + assert not placement_group.deleted + assert placement_group.last_processed_at > original_last_processed_at + assert placement_group.lock_token is None + assert placement_group.lock_expires_at is None + assert placement_group.lock_owner is None diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_fleets.py b/src/tests/_internal/server/background/scheduled_tasks/test_fleets.py index 2ef1b27ab9..2136a2c963 100644 --- a/src/tests/_internal/server/background/scheduled_tasks/test_fleets.py +++ 
b/src/tests/_internal/server/background/scheduled_tasks/test_fleets.py @@ -154,8 +154,17 @@ async def test_consolidation_terminates_redundant_instances( status=InstanceStatus.IDLE, instance_num=1, ) + instance3 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + instance_num=2, + ) await process_fleets() await session.refresh(instance1) await session.refresh(instance2) + await session.refresh(instance3) assert instance1.status == InstanceStatus.BUSY assert instance2.status == InstanceStatus.TERMINATING + assert instance3.deleted diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index 1a250612ba..02a4430b7d 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -931,6 +931,37 @@ async def test_returns_400_when_fleets_in_use( assert not fleet.deleted assert instance.status == InstanceStatus.BUSY + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_fleet_locked( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + ) + fleet.instances.append(instance) + fleet.lock_expires_at = datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc) + await session.commit() + + response = await client.post( + f"/api/project/{project.name}/fleets/delete", + headers=get_auth_headers(user.token), + json={"names": [fleet.name]}, + ) + assert response.status_code == 400 + + await session.refresh(fleet) + await session.refresh(instance) + assert fleet.status != 
FleetStatus.TERMINATING + assert instance.status != InstanceStatus.TERMINATING + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_forbids_if_no_permission_to_manage_ssh_fleets( @@ -1057,6 +1088,38 @@ async def test_returns_400_when_deleting_busy_instances( assert instance.status != InstanceStatus.TERMINATING assert fleet.status != FleetStatus.TERMINATING + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_fleet_locked( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance = await create_instance( + session=session, + project=project, + instance_num=1, + ) + fleet.instances.append(instance) + fleet.lock_expires_at = datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc) + await session.commit() + + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 400 + + await session.refresh(fleet) + await session.refresh(instance) + assert fleet.status != FleetStatus.TERMINATING + assert instance.status != InstanceStatus.TERMINATING + class TestGetPlan: @pytest.mark.asyncio From 6d61113cb7f5944d6563384d60db788947b22284 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Mon, 2 Mar 2026 08:06:46 +0000 Subject: [PATCH 176/187] Replace RunPod -> Runpod in docs, blog, comments (#3625) https://www.runpod.io/blog/its-runpod-not-runpod-a-message-for-large-language-models-and-the-humans-who-love-them --- .../blog/posts/amd-mi300x-inference-benchmark.md | 4 ++-- 
docs/blog/posts/amd-on-runpod.md | 14 +++++++------- ...yond-kubernetes-2024-recap-and-whats-ahead.md | 4 ++-- docs/blog/posts/dstack-sky-own-cloud-accounts.md | 2 +- docs/blog/posts/state-of-cloud-gpu-2025.md | 4 ++-- docs/blog/posts/toffee.md | 4 ++-- docs/blog/posts/volumes-on-runpod.md | 16 ++++++++-------- docs/docs/concepts/backends.md | 4 ++-- docs/docs/concepts/snippets/manage-fleets.ext | 2 +- docs/docs/guides/migration/slurm.md | 4 ++-- docs/docs/guides/protips.md | 2 +- examples/accelerators/amd/README.md | 6 +++--- .../_internal/core/backends/runpod/api_client.py | 4 ++-- .../_internal/core/backends/runpod/compute.py | 2 +- .../_internal/core/backends/runpod/models.py | 2 +- 15 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/blog/posts/amd-mi300x-inference-benchmark.md b/docs/blog/posts/amd-mi300x-inference-benchmark.md index bc747ee781..18b8d343c6 100644 --- a/docs/blog/posts/amd-mi300x-inference-benchmark.md +++ b/docs/blog/posts/amd-mi300x-inference-benchmark.md @@ -217,8 +217,8 @@ is the primary sponsor of this benchmark, and we are sincerely grateful for thei If you'd like to use top-tier bare metal compute with AMD GPUs, we recommend going with Hot Aisle. Once you gain access to a cluster, it can be easily accessed via `dstack`'s [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets) easily. -### RunPod +### Runpod If you’d like to use on-demand compute with AMD GPUs at affordable prices, you can configure `dstack` to -use [RunPod](https://runpod.io/). In +use [Runpod](https://runpod.io/). In this case, `dstack` will be able to provision fleets automatically when you run dev environments, tasks, and services. 
diff --git a/docs/blog/posts/amd-on-runpod.md b/docs/blog/posts/amd-on-runpod.md index c1ff25015b..0d5c60b4e9 100644 --- a/docs/blog/posts/amd-on-runpod.md +++ b/docs/blog/posts/amd-on-runpod.md @@ -1,25 +1,25 @@ --- -title: Supporting AMD accelerators on RunPod +title: Supporting AMD accelerators on Runpod date: 2024-08-21 -description: "dstack, the open-source AI container orchestration platform, adds support for AMD accelerators, with RunPod as the first supported cloud provider." +description: "dstack, the open-source AI container orchestration platform, adds support for AMD accelerators, with Runpod as the first supported cloud provider." slug: amd-on-runpod categories: - Changelog --- -# Supporting AMD accelerators on RunPod +# Supporting AMD accelerators on Runpod While `dstack` helps streamline the orchestration of containers for AI, its primary goal is to offer vendor independence and portability, ensuring compatibility across different hardware and cloud providers. -Inspired by the recent `MI300X` benchmarks, we are pleased to announce that RunPod is the first cloud provider to offer +Inspired by the recent `MI300X` benchmarks, we are pleased to announce that Runpod is the first cloud provider to offer AMD GPUs through `dstack`, with support for other cloud providers and on-prem servers to follow. ## Specification -For the reference, below is a comparison of the `MI300X` and `H100 SXM` specs, incl. the prices offered by RunPod. +For the reference, below is a comparison of the `MI300X` and `H100 SXM` specs, incl. the prices offered by Runpod. | | MI300X | H100X SXM | |---------------------------------|-------------------------------------------|--------------| @@ -113,8 +113,8 @@ cloud resources and run the configuration. 1. The examples above demonstrate the use of [TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd). 
AMD accelerators can also be used with other frameworks like vLLM, Ollama, etc., and we'll be adding more examples soon. -2. RunPod is the first cloud provider where dstack supports AMD. More cloud providers will be supported soon as well. -3. Want to give RunPod and `dstack` a try? Make sure you've signed up for [RunPod](https://www.runpod.io/), +2. Runpod is the first cloud provider where dstack supports AMD. More cloud providers will be supported soon as well. +3. Want to give Runpod and `dstack` a try? Make sure you've signed up for [Runpod](https://www.runpod.io/), then [set up](../../docs/reference/server/config.yml.md#runpod) the `dstack server`. > Have questioned or feedback? Join our [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md index 4c6b43f9bf..9d32f336b0 100644 --- a/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md +++ b/docs/blog/posts/beyond-kubernetes-2024-recap-and-whats-ahead.md @@ -22,7 +22,7 @@ While `dstack` integrates with leading cloud GPU providers, we aim to expand par sharing our vision of simplifying AI infrastructure orchestration with a lightweight, efficient alternative to Kubernetes. This year, we’re excited to welcome our first partners: [Lambda](https://lambdalabs.com/), -[RunPod](https://www.runpod.io/), +[Runpod](https://www.runpod.io/), [CUDO Compute](https://www.cudocompute.com/), and [Hot Aisle](https://hotaisle.xyz/). @@ -114,7 +114,7 @@ This year, we’re particularly proud of our newly added integration with AMD. `dstack` works seamlessly with any on-prem AMD clusters. For example, you can rent such servers through our partner [Hot Aisle](https://hotaisle.xyz/). -> Among cloud providers, [AMD](https://www.amd.com/en/products/accelerators/instinct.html) is supported only through RunPod. 
In Q1 2025, we plan to extend it to +> Among cloud providers, [AMD](https://www.amd.com/en/products/accelerators/instinct.html) is supported only through Runpod. In Q1 2025, we plan to extend it to [Nscale](https://www.nscale.com/), > [Hot Aisle](https://hotaisle.xyz/), and potentially other providers open to collaboration. diff --git a/docs/blog/posts/dstack-sky-own-cloud-accounts.md b/docs/blog/posts/dstack-sky-own-cloud-accounts.md index 13c927a313..8fe8c9c4e4 100644 --- a/docs/blog/posts/dstack-sky-own-cloud-accounts.md +++ b/docs/blog/posts/dstack-sky-own-cloud-accounts.md @@ -25,7 +25,7 @@ To use your own cloud account, open the project settings and edit the correspond ![dstack-sky-banner.png](https://raw.githubusercontent.com/dstackai/static-assets/main/static-assets/images/dstack-sky-edit-backend-config.png){ width=650 } You can configure your cloud accounts for any of the supported providers, including AWS, GCP, Azure, TensorDock, Lambda, -CUDO, RunPod, and Vast.ai. +CUDO, Runpod, and Vast.ai. Additionally, you can disable certain backends if you do not plan to use them. 
diff --git a/docs/blog/posts/state-of-cloud-gpu-2025.md b/docs/blog/posts/state-of-cloud-gpu-2025.md index 238926ebf8..b9add79156 100644 --- a/docs/blog/posts/state-of-cloud-gpu-2025.md +++ b/docs/blog/posts/state-of-cloud-gpu-2025.md @@ -28,7 +28,7 @@ These axes split providers into distinct archetypes—each with different econom | :---- | :---- | :---- | | **Classical hyperscalers** | General-purpose clouds with GPU SKUs bolted on | AWS, Google Cloud, Azure, OCI | | **Massive neoclouds** | GPU-first operators built around dense HGX or MI-series clusters | CoreWeave, Lambda, Nebius, Crusoe | -| **Rapidly-catching neoclouds** | Smaller GPU-first players building out aggressively | RunPod, DataCrunch, Voltage Park, TensorWave, Hot Aisle | +| **Rapidly-catching neoclouds** | Smaller GPU-first players building out aggressively | Runpod, DataCrunch, Voltage Park, TensorWave, Hot Aisle | | **Cloud marketplaces** | Don’t own capacity; sell orchestration + unified API over multiple backends | NVIDIA DGX Cloud (Lepton), Modal, Lightning AI, dstack Sky | | **DC aggregators** | Aggregate idle capacity from third-party datacenters, pricing via market dynamics | Vast.ai | @@ -89,7 +89,7 @@ For comparison, below is the price range for H100×GPU clusters across providers -> Most hyperscalers and neoclouds need short- or long-term contracts, though providers like RunPod, DataCrunch, and Nebius offer on-demand clusters. Larger capacity and longer commitments bring bigger discounts — Nebius offers up to 35% off for longer terms. +> Most hyperscalers and neoclouds need short- or long-term contracts, though providers like Runpod, DataCrunch, and Nebius offer on-demand clusters. Larger capacity and longer commitments bring bigger discounts — Nebius offers up to 35% off for longer terms. 
## New GPU generations – why they matter diff --git a/docs/blog/posts/toffee.md b/docs/blog/posts/toffee.md index 3854937e53..190ecf8c27 100644 --- a/docs/blog/posts/toffee.md +++ b/docs/blog/posts/toffee.md @@ -20,7 +20,7 @@ In a recent engineering [blog post](https://research.toffee.ai/blog/how-we-use-d [Toffee](https://toffee.ai) builds AI-powered experiences backed by LLMs and image-generation models. To serve these workloads efficiently, they combine: -- **GPU neoclouds** such as [RunPod](https://www.runpod.io/) and [Vast.ai](https://vast.ai/) for flexible, cost-efficient GPU capacity +- **GPU neoclouds** such as [Runpod](https://www.runpod.io/) and [Vast.ai](https://vast.ai/) for flexible, cost-efficient GPU capacity - **AWS** for core, non-AI services and backend infrastructure - **dstack** as the orchestration layer that provisions GPU resources and exposes AI models via `dstack` [services](../../docs/concepts/services.md) and [gateways](../../docs/concepts/gateways.md) @@ -68,7 +68,7 @@ Beyond oechestration, Toffee relies on `dstack`’s UI as a central observabilit -> *Thanks to dstack’s seamless integration with GPU neoclouds like RunPod and Vast.ai, we’ve been able to shift most workloads off hyperscalers — reducing our effective GPU spend by roughly 2–3× without changing a single line of model code.* +> *Thanks to dstack’s seamless integration with GPU neoclouds like Runpod and Vast.ai, we’ve been able to shift most workloads off hyperscalers — reducing our effective GPU spend by roughly 2–3× without changing a single line of model code.* > > *— [Nikita Shupeyko](https://www.linkedin.com/in/nikita-shupeyko/), AI/ML & Cloud Infrastructure Architect at Toffee* diff --git a/docs/blog/posts/volumes-on-runpod.md b/docs/blog/posts/volumes-on-runpod.md index de0c8d6d0a..c17faf7b13 100644 --- a/docs/blog/posts/volumes-on-runpod.md +++ b/docs/blog/posts/volumes-on-runpod.md @@ -1,24 +1,24 @@ --- -title: Using volumes to optimize cold starts on RunPod +title: 
Using volumes to optimize cold starts on Runpod date: 2024-08-13 -description: "Learn how to use volumes with dstack to optimize model inference cold start times on RunPod." +description: "Learn how to use volumes with dstack to optimize model inference cold start times on Runpod." slug: volumes-on-runpod categories: - Changelog --- -# Using volumes to optimize cold starts on RunPod +# Using volumes to optimize cold starts on Runpod Deploying custom models in the cloud often faces the challenge of cold start times, including the time to provision a new instance and download the model. This is especially relevant for services with autoscaling when new model replicas need to be provisioned quickly. Let's explore how `dstack` optimizes this process using volumes, with an example of -deploying a model on RunPod. +deploying a model on Runpod. -Suppose you want to deploy Llama 3.1 on RunPod as a [service](../../docs/concepts/services.md): +Suppose you want to deploy Llama 3.1 on Runpod as a [service](../../docs/concepts/services.md):
    @@ -59,9 +59,9 @@ When starting each replica, `text-generation-launcher` downloads the model to th usually takes under a minute, but larger models may take longer. Repeated downloads can significantly affect auto-scaling efficiency. -Great news: RunPod supports network volumes, which we can use for caching models across multiple replicas. +Great news: Runpod supports network volumes, which we can use for caching models across multiple replicas. -With `dstack`, you can create a RunPod volume using the following configuration: +With `dstack`, you can create a Runpod volume using the following configuration:
    @@ -130,7 +130,7 @@ resources: In this case, `dstack` attaches the specified volume to each new replica. This ensures the model is downloaded only once, reducing cold start time in proportion to the model size. -A notable feature of RunPod is that volumes can be attached to multiple containers simultaneously. This capability is +A notable feature of Runpod is that volumes can be attached to multiple containers simultaneously. This capability is particularly useful for auto-scalable services or distributed tasks. Using [volumes](../../docs/concepts/volumes.md) not only optimizes inference cold start times but also enhances the diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index bf731823fa..620d5723cb 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -1132,9 +1132,9 @@ projects: > To learn more, see the [Lambda](../../examples/clusters/lambda/#kubernetes) and [Crusoe](../../examples/clusters/crusoe/#kubernetes) examples. -### RunPod +### Runpod -Log into your [RunPod](https://www.runpod.io/console/) console, click Settings in the sidebar, expand the `API Keys` section, and click +Log into your [Runpod](https://www.runpod.io/console/) console, click Settings in the sidebar, expand the `API Keys` section, and click the button to create a Read & Write key. Then proceed to configuring the backend. diff --git a/docs/docs/concepts/snippets/manage-fleets.ext b/docs/docs/concepts/snippets/manage-fleets.ext index c9835fc679..b30b4126a8 100644 --- a/docs/docs/concepts/snippets/manage-fleets.ext +++ b/docs/docs/concepts/snippets/manage-fleets.ext @@ -7,4 +7,4 @@ If the run reuses an existing fleet instance, only the fleet's If an instance remains `idle`, it is automatically terminated after `idle_duration`. -> Not applied for container-based backends (Kubernetes, Vast.ai, RunPod). +> Not applied for container-based backends (Kubernetes, Vast.ai, Runpod). 
diff --git a/docs/docs/guides/migration/slurm.md b/docs/docs/guides/migration/slurm.md index d006497399..1020778842 100644 --- a/docs/docs/guides/migration/slurm.md +++ b/docs/docs/guides/migration/slurm.md @@ -908,7 +908,7 @@ resources: #### Network volumes -Network volumes are persistent cloud storage (AWS EBS, GCP persistent disks, RunPod volumes). +Network volumes are persistent cloud storage (AWS EBS, GCP persistent disks, Runpod volumes). Single-node task: @@ -936,7 +936,7 @@ resources:
    -Network volumes cannot be used with distributed tasks (no multi-attach support), except where multi-attach is supported (RunPod) or via volume interpolation. +Network volumes cannot be used with distributed tasks (no multi-attach support), except where multi-attach is supported (Runpod) or via volume interpolation. For distributed tasks, use interpolation to attach different volumes to each node. diff --git a/docs/docs/guides/protips.md b/docs/docs/guides/protips.md index 4aa5df93fb..dcf3fe1966 100644 --- a/docs/docs/guides/protips.md +++ b/docs/docs/guides/protips.md @@ -218,7 +218,7 @@ If the run reuses an existing fleet instance, only the fleet's If an instance remains `idle`, it is automatically terminated after `idle_duration`. -> Not applied for container-based backends (Kubernetes, Vast.ai, RunPod). +> Not applied for container-based backends (Kubernetes, Vast.ai, Runpod). ## Volumes diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md index a660acddc7..9dfe364410 100644 --- a/examples/accelerators/amd/README.md +++ b/examples/accelerators/amd/README.md @@ -55,7 +55,7 @@ Llama 3.1 70B in FP16 using [TGI](https://huggingface.co/docs/text-generation-in type: service name: llama31-service-vllm-amd - # Using RunPod's ROCm Docker image + # Using Runpod's ROCm Docker image image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 # Required environment variables env: @@ -125,7 +125,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by type: task name: trl-amd-llama31-train - # Using RunPod's ROCm Docker image + # Using Runpod's ROCm Docker image image: runpod/pytorch:2.1.2-py3.10-rocm6.1-ubuntu22.04 # Required environment variables @@ -172,7 +172,7 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by # The name is optional, if not specified, generated randomly name: axolotl-amd-llama31-train - # Using RunPod's ROCm Docker image + # Using Runpod's ROCm Docker image 
image: runpod/pytorch:2.1.2-py3.10-rocm6.0.2-ubuntu22.04 # Required environment variables env: diff --git a/src/dstack/_internal/core/backends/runpod/api_client.py b/src/dstack/_internal/core/backends/runpod/api_client.py index 40b607aaf3..a45a294baf 100644 --- a/src/dstack/_internal/core/backends/runpod/api_client.py +++ b/src/dstack/_internal/core/backends/runpod/api_client.py @@ -108,7 +108,7 @@ def edit_pod( container_disk_in_gb: int, container_registry_auth_id: str, # Default pod volume is 20GB. - # RunPod errors if it's not specified for podEditJob. + # Runpod errors if it's not specified for podEditJob. volume_in_gb: int = 20, ) -> str: resp = self._make_request( @@ -320,7 +320,7 @@ def _make_request(self, data: Optional[Dict[str, Any]] = None) -> Response: ) response.raise_for_status() response_json = response.json() - # RunPod returns 200 on client errors + # Runpod returns 200 on client errors if "errors" in response_json: raise RunpodApiClientError(errors=response_json["errors"]) return response diff --git a/src/dstack/_internal/core/backends/runpod/compute.py b/src/dstack/_internal/core/backends/runpod/compute.py index bd5ae0e8cc..ec03362585 100644 --- a/src/dstack/_internal/core/backends/runpod/compute.py +++ b/src/dstack/_internal/core/backends/runpod/compute.py @@ -50,7 +50,7 @@ CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24 # 24 hour -# RunPod does not seem to have any limits on the disk size. +# Runpod does not seem to have any limits on the disk size. 
CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None) diff --git a/src/dstack/_internal/core/backends/runpod/models.py b/src/dstack/_internal/core/backends/runpod/models.py index 076d67cfa0..7bc11c2818 100644 --- a/src/dstack/_internal/core/backends/runpod/models.py +++ b/src/dstack/_internal/core/backends/runpod/models.py @@ -20,7 +20,7 @@ class RunpodBackendConfig(CoreModel): type: Literal["runpod"] = "runpod" regions: Annotated[ Optional[List[str]], - Field(description="The list of RunPod regions. Omit to use all regions"), + Field(description="The list of Runpod regions. Omit to use all regions"), ] = None community_cloud: Annotated[ Optional[bool], From 9eea926fd68f2fefae60de1a342e1e70f3d91220 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Mon, 2 Mar 2026 10:59:14 +0000 Subject: [PATCH 177/187] Extend fleet and instance permission tests (#3627) Add tests to ensure upcoming cross-project fleet sharing changes do not break existing contracts. 
--- .../_internal/server/routers/test_fleets.py | 68 +++++++++++++++++++ .../server/routers/test_instances.py | 30 ++++++++ 2 files changed, 98 insertions(+) diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index 02a4430b7d..fed647d2c8 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -192,6 +192,51 @@ async def test_returns_40x_if_not_authenticated( response = await client.post("/api/project/main/fleets/get") assert response.status_code in [401, 403] + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(True, id="by-id")] + ) + async def test_returns_403_on_nonexistent_fleet_in_foreign_project( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + await create_project(session, name="test-project") + user = await create_user(session, global_role=GlobalRole.USER) # not a project member + if by_id: + body = {"id": str(uuid4())} + else: + body = {"name": "nonexistent"} + response = await client.post( + "/api/project/test-project/fleets/get", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 403 + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(True, id="by-id")] + ) + async def test_returns_403_on_deleted_fleet_in_foreign_project( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + project = await create_project(session, name="test-project") + user = await create_user(session, global_role=GlobalRole.USER) # not a project member + fleet = await create_fleet( + session=session, project=project, deleted=True, name="deleted-fleet" + ) + if by_id: + body = {"id": 
str(fleet.id)} + else: + body = {"name": "deleted-fleet"} + response = await client.post( + "/api/project/test-project/fleets/get", + headers=get_auth_headers(user.token), + json=body, + ) + assert response.status_code == 403 + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @pytest.mark.parametrize("deleted", [False, True]) @@ -303,6 +348,29 @@ async def test_not_returns_by_name_if_fleet_does_not_exist( ) assert response.status_code == 400 + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(True, id="by-id")] + ) + async def test_returns_foreign_fleet_to_global_admin( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + admin = await create_user(session, global_role=GlobalRole.ADMIN) + project = await create_project(session, name="test-project") + fleet = await create_fleet(session=session, project=project, name="test-fleet") + if by_id: + body = {"id": str(fleet.id)} + else: + body = {"name": "test-fleet"} + response = await client.post( + "/api/project/test-project/fleets/get", + headers=get_auth_headers(admin.token), + json=body, + ) + assert response.status_code == 200 + assert response.json()["name"] == "test-fleet" + class TestApplyFleetPlan: @pytest.mark.asyncio diff --git a/src/tests/_internal/server/routers/test_instances.py b/src/tests/_internal/server/routers/test_instances.py index 5f9e41df31..45363bfd92 100644 --- a/src/tests/_internal/server/routers/test_instances.py +++ b/src/tests/_internal/server/routers/test_instances.py @@ -422,6 +422,23 @@ async def test_returns_instance_by_id( assert resp_data["project_name"] == project.name assert resp_data["fleet_name"] == fleet.name + async def test_returns_instance_to_global_admin( + self, session: AsyncSession, client: AsyncClient + ) -> None: + admin = await create_user(session, 
global_role=GlobalRole.ADMIN, name="global-admin") + project = await create_project(session) + fleet = await create_fleet(session, project) + instance = await create_instance(session=session, project=project, fleet=fleet) + + resp = await client.post( + f"/api/project/{project.name}/instances/get", + headers=get_auth_headers(admin.token), + json={"id": str(instance.id)}, + ) + assert resp.status_code == 200 + resp_data = resp.json() + assert resp_data["id"] == str(instance.id) + async def test_returns_400_if_instance_not_found( self, session: AsyncSession, client: AsyncClient ) -> None: @@ -479,3 +496,16 @@ async def test_returns_403_if_not_project_member( json={"id": str(instance.id)}, ) assert resp.status_code == 403 + + async def test_returns_403_if_not_project_member_and_instance_not_exists( + self, session: AsyncSession, client: AsyncClient + ) -> None: + user = await create_user(session, name="non_member", global_role=GlobalRole.USER) + project = await create_project(session) + + resp = await client.post( + f"/api/project/{project.name}/instances/get", + headers=get_auth_headers(user.token), + json={"id": str(uuid.uuid4())}, + ) + assert resp.status_code == 403 From ac875f674828e38f6f0786ac86f00b3e3027e543 Mon Sep 17 00:00:00 2001 From: Dmitry Meyer Date: Tue, 3 Mar 2026 08:48:28 +0000 Subject: [PATCH 178/187] Reorganize Go codebase (#3628) The new structure: / go.mod cmd/ runner/ shim/ internal/ common/ runner/ shim/ --- runner/cmd/runner/main.go | 10 +-- runner/cmd/shim/main.go | 8 +-- .../{api/common.go => common/api/api.go} | 2 +- runner/{ => internal/common}/consts/consts.go | 0 runner/internal/common/{ => gpu}/gpu.go | 2 +- runner/internal/common/interpolator.go | 67 ------------------- runner/internal/common/interpolator_test.go | 64 ------------------ runner/internal/{ => common}/log/log.go | 0 runner/internal/common/string.go | 11 --- runner/internal/{ => common}/types/types.go | 10 --- runner/internal/common/{ => utils}/utils.go | 4 +- 
.../internal/common/{ => utils}/utils_test.go | 2 +- runner/internal/runner/api/http.go | 8 +-- runner/internal/runner/api/server.go | 8 +-- runner/internal/runner/api/ws.go | 2 +- .../{ => runner}/connections/connections.go | 2 +- .../connections/connections_test.go | 0 runner/internal/{ => runner}/executor/base.go | 8 +-- runner/internal/{ => runner}/executor/env.go | 0 .../{ => runner}/executor/env_test.go | 0 .../{ => runner}/executor/executor.go | 56 ++++++++-------- .../{ => runner}/executor/executor_test.go | 4 +- .../internal/{ => runner}/executor/files.go | 8 +-- runner/internal/{ => runner}/executor/lock.go | 0 runner/internal/{ => runner}/executor/logs.go | 2 +- .../internal/{ => runner}/executor/query.go | 2 +- runner/internal/{ => runner}/executor/repo.go | 12 ++-- .../internal/{ => runner}/executor/states.go | 0 .../{ => runner}/executor/timestamp.go | 2 +- .../{ => runner}/executor/timestamp_test.go | 0 runner/internal/{ => runner}/executor/user.go | 6 +- .../{ => runner}/executor/user_test.go | 4 +- .../linux/capabilities/capabilities_darwin.go | 0 .../linux/capabilities/capabilities_linux.go | 0 .../internal/{ => runner}/linux/user/user.go | 0 .../internal/{ => runner}/metrics/cgroups.go | 2 +- .../{ => runner}/metrics/cgroups_test.go | 0 .../internal/{ => runner}/metrics/metrics.go | 20 +++--- .../{ => runner}/metrics/metrics_test.go | 2 +- runner/internal/{ => runner}/repo/diff.go | 2 +- .../internal/{ => runner}/repo/diff_test.go | 0 runner/internal/{ => runner}/repo/manager.go | 2 +- .../internal/{ => runner}/schemas/schemas.go | 14 +++- runner/internal/{ => runner}/ssh/sshd.go | 8 +-- runner/internal/shim/api/handlers.go | 4 +- runner/internal/shim/api/handlers_test.go | 8 +-- runner/internal/shim/api/server.go | 4 +- runner/internal/shim/components/utils.go | 6 +- runner/internal/shim/dcgm/exporter.go | 2 +- runner/internal/shim/docker.go | 36 +++++----- runner/internal/shim/host/gpu.go | 28 ++++---- runner/internal/shim/host/gpu_test.go | 42 
++++++------ runner/internal/shim/host/host.go | 2 +- runner/internal/shim/host_info.go | 20 +++--- runner/internal/shim/resources.go | 26 +++---- runner/internal/shim/resources_test.go | 56 ++++++++-------- runner/internal/shim/task.go | 2 +- 57 files changed, 224 insertions(+), 366 deletions(-) rename runner/internal/{api/common.go => common/api/api.go} (98%) rename runner/{ => internal/common}/consts/consts.go (100%) rename runner/internal/common/{ => gpu}/gpu.go (98%) delete mode 100644 runner/internal/common/interpolator.go delete mode 100644 runner/internal/common/interpolator_test.go rename runner/internal/{ => common}/log/log.go (100%) delete mode 100644 runner/internal/common/string.go rename runner/internal/{ => common}/types/types.go (72%) rename runner/internal/common/{ => utils}/utils.go (95%) rename runner/internal/common/{ => utils}/utils_test.go (99%) rename runner/internal/{ => runner}/connections/connections.go (98%) rename runner/internal/{ => runner}/connections/connections_test.go (100%) rename runner/internal/{ => runner}/executor/base.go (75%) rename runner/internal/{ => runner}/executor/env.go (100%) rename runner/internal/{ => runner}/executor/env_test.go (100%) rename runner/internal/{ => runner}/executor/executor.go (93%) rename runner/internal/{ => runner}/executor/executor_test.go (98%) rename runner/internal/{ => runner}/executor/files.go (91%) rename runner/internal/{ => runner}/executor/lock.go (100%) rename runner/internal/{ => runner}/executor/logs.go (91%) rename runner/internal/{ => runner}/executor/query.go (94%) rename runner/internal/{ => runner}/executor/repo.go (95%) rename runner/internal/{ => runner}/executor/states.go (100%) rename runner/internal/{ => runner}/executor/timestamp.go (95%) rename runner/internal/{ => runner}/executor/timestamp_test.go (100%) rename runner/internal/{ => runner}/executor/user.go (96%) rename runner/internal/{ => runner}/executor/user_test.go (98%) rename runner/internal/{ => 
runner}/linux/capabilities/capabilities_darwin.go (100%) rename runner/internal/{ => runner}/linux/capabilities/capabilities_linux.go (100%) rename runner/internal/{ => runner}/linux/user/user.go (100%) rename runner/internal/{ => runner}/metrics/cgroups.go (97%) rename runner/internal/{ => runner}/metrics/cgroups_test.go (100%) rename runner/internal/{ => runner}/metrics/metrics.go (95%) rename runner/internal/{ => runner}/metrics/metrics_test.go (95%) rename runner/internal/{ => runner}/repo/diff.go (98%) rename runner/internal/{ => runner}/repo/diff_test.go (100%) rename runner/internal/{ => runner}/repo/manager.go (98%) rename runner/internal/{ => runner}/schemas/schemas.go (92%) rename runner/internal/{ => runner}/ssh/sshd.go (96%) diff --git a/runner/cmd/runner/main.go b/runner/cmd/runner/main.go index 8a62fd6f50..8be7020112 100644 --- a/runner/cmd/runner/main.go +++ b/runner/cmd/runner/main.go @@ -14,12 +14,12 @@ import ( "github.com/sirupsen/logrus" "github.com/urfave/cli/v3" - "github.com/dstackai/dstack/runner/consts" - "github.com/dstackai/dstack/runner/internal/executor" - linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/consts" + "github.com/dstackai/dstack/runner/internal/common/log" "github.com/dstackai/dstack/runner/internal/runner/api" - "github.com/dstackai/dstack/runner/internal/ssh" + "github.com/dstackai/dstack/runner/internal/runner/executor" + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/ssh" ) // Version is a build-time variable. The value is overridden by ldflags. 
diff --git a/runner/cmd/shim/main.go b/runner/cmd/shim/main.go index 644d7e80e8..c696bd4673 100644 --- a/runner/cmd/shim/main.go +++ b/runner/cmd/shim/main.go @@ -15,9 +15,9 @@ import ( "github.com/sirupsen/logrus" "github.com/urfave/cli/v3" - "github.com/dstackai/dstack/runner/consts" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/consts" + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" "github.com/dstackai/dstack/runner/internal/shim" "github.com/dstackai/dstack/runner/internal/shim/api" "github.com/dstackai/dstack/runner/internal/shim/components" @@ -236,7 +236,7 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error) var dcgmExporter *dcgm.DCGMExporter var dcgmWrapper dcgm.DCGMWrapperInterface - if common.GetGpuVendor() == common.GpuVendorNvidia { + if gpu.GetGpuVendor() == gpu.GpuVendorNvidia { dcgmExporterPath, err := dcgm.GetDCGMExporterExecPath(ctx) if err == nil { interval := time.Duration(args.DCGMExporter.Interval * int(time.Millisecond)) diff --git a/runner/internal/api/common.go b/runner/internal/common/api/api.go similarity index 98% rename from runner/internal/api/common.go rename to runner/internal/common/api/api.go index 52fa886a0f..85cab57164 100644 --- a/runner/internal/api/common.go +++ b/runner/internal/common/api/api.go @@ -10,7 +10,7 @@ import ( "github.com/golang/gddo/httputil/header" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) type Error struct { diff --git a/runner/consts/consts.go b/runner/internal/common/consts/consts.go similarity index 100% rename from runner/consts/consts.go rename to runner/internal/common/consts/consts.go diff --git a/runner/internal/common/gpu.go b/runner/internal/common/gpu/gpu.go similarity index 98% rename from runner/internal/common/gpu.go 
rename to runner/internal/common/gpu/gpu.go index 045cc773be..72ae83bb56 100644 --- a/runner/internal/common/gpu.go +++ b/runner/internal/common/gpu/gpu.go @@ -1,4 +1,4 @@ -package common +package gpu import ( "errors" diff --git a/runner/internal/common/interpolator.go b/runner/internal/common/interpolator.go deleted file mode 100644 index 84597df7fa..0000000000 --- a/runner/internal/common/interpolator.go +++ /dev/null @@ -1,67 +0,0 @@ -package common - -import ( - "context" - "fmt" - "strings" - - "github.com/dstackai/dstack/runner/internal/log" -) - -const ( - PatternOpening = "${{" - PatternClosing = "}}" -) - -type VariablesInterpolator struct { - Variables map[string]string -} - -func (vi *VariablesInterpolator) Add(namespace string, vars map[string]string) { - if vi.Variables == nil { - vi.Variables = make(map[string]string, len(vars)) - } - for k, v := range vars { - vi.Variables[fmt.Sprintf("%s.%s", namespace, k)] = v - } -} - -func (vi *VariablesInterpolator) Interpolate(ctx context.Context, s string) (string, error) { - log.Trace(ctx, "Interpolating", "s", s) - var sb strings.Builder - - start := 0 - for start < len(s) { - dollar := IndexWithOffset(s, "$", start) - if dollar == -1 || dollar == len(s)-1 { - sb.WriteString(s[start:]) - break - } - if s[dollar+1] == '$' { // $$ = escaped $ - sb.WriteString(s[start : dollar+1]) - start = dollar + 2 - continue - } - - opening := IndexWithOffset(s, PatternOpening, start) - if opening == -1 { - sb.WriteString(s[start:]) - break - } - sb.WriteString(s[start:opening]) - closing := IndexWithOffset(s, PatternClosing, opening) - if closing == -1 { - return "", fmt.Errorf("no pattern closing: %s", s[opening:]) - } - - name := strings.TrimSpace(s[opening+len(PatternOpening) : closing]) - value, ok := vi.Variables[name] - if ok { - sb.WriteString(value) - } else { - log.Warning(ctx, "Variable is missing", "name", name) - } - start = closing + len(PatternClosing) - } - return sb.String(), nil -} diff --git 
a/runner/internal/common/interpolator_test.go b/runner/internal/common/interpolator_test.go deleted file mode 100644 index e14a248744..0000000000 --- a/runner/internal/common/interpolator_test.go +++ /dev/null @@ -1,64 +0,0 @@ -package common - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestPlainText(t *testing.T) { - var vi VariablesInterpolator - s := "plain text" - result, err := vi.Interpolate(context.Background(), s) - assert.Equal(t, nil, err) - assert.Equal(t, s, result) -} - -func TestMissingVariable(t *testing.T) { - var vi VariablesInterpolator - result, err := vi.Interpolate(context.Background(), "${{ VAR_NAME }} is here") - assert.Equal(t, nil, err) - assert.Equal(t, " is here", result) -} - -func TestDollarEscape(t *testing.T) { - var vi VariablesInterpolator - result, err := vi.Interpolate(context.Background(), "it is not a variable $$!") - assert.Equal(t, nil, err) - assert.Equal(t, "it is not a variable $!", result) -} - -func TestDollarWithoutEscape(t *testing.T) { - var vi VariablesInterpolator - result, err := vi.Interpolate(context.Background(), "it is not a variable $!") - assert.Equal(t, nil, err) - assert.Equal(t, "it is not a variable $!", result) -} - -func TestEscapeOpening(t *testing.T) { - var vi VariablesInterpolator - result, err := vi.Interpolate(context.Background(), "$${{ VAR_NAME }}") - assert.Equal(t, nil, err) - assert.Equal(t, "${{ VAR_NAME }}", result) -} - -func TestWithoutClosing(t *testing.T) { - var vi VariablesInterpolator - _, err := vi.Interpolate(context.Background(), "the end ${{") - assert.NotEqual(t, nil, err) -} - -func TestUnexpectedEOL(t *testing.T) { - var vi VariablesInterpolator - _, err := vi.Interpolate(context.Background(), "the end ${{ VAR }") - assert.NotEqual(t, nil, err) -} - -func TestSecrets(t *testing.T) { - var vi VariablesInterpolator - vi.Add("secrets", map[string]string{"user": "qwerty"}) - result, err := vi.Interpolate(context.Background(), "${{ 
secrets.user }}") - assert.Equal(t, nil, err) - assert.Equal(t, "qwerty", result) -} diff --git a/runner/internal/log/log.go b/runner/internal/common/log/log.go similarity index 100% rename from runner/internal/log/log.go rename to runner/internal/common/log/log.go diff --git a/runner/internal/common/string.go b/runner/internal/common/string.go deleted file mode 100644 index 28a5ae0756..0000000000 --- a/runner/internal/common/string.go +++ /dev/null @@ -1,11 +0,0 @@ -package common - -import "strings" - -func IndexWithOffset(hay string, needle string, start int) int { - idx := strings.Index(hay[start:], needle) - if idx < 0 { - return -1 - } - return start + idx -} diff --git a/runner/internal/types/types.go b/runner/internal/common/types/types.go similarity index 72% rename from runner/internal/types/types.go rename to runner/internal/common/types/types.go index e8a9519eb4..b7f6c6fd3a 100644 --- a/runner/internal/types/types.go +++ b/runner/internal/common/types/types.go @@ -11,13 +11,3 @@ const ( TerminationReasonTerminatedByServer TerminationReason = "terminated_by_server" TerminationReasonMaxDurationExceeded TerminationReason = "max_duration_exceeded" ) - -type JobState string - -const ( - JobStateDone JobState = "done" - JobStateFailed JobState = "failed" - JobStateRunning JobState = "running" - JobStateTerminated JobState = "terminated" - JobStateTerminating JobState = "terminating" -) diff --git a/runner/internal/common/utils.go b/runner/internal/common/utils/utils.go similarity index 95% rename from runner/internal/common/utils.go rename to runner/internal/common/utils/utils.go index 5be68edf70..5bfc17d867 100644 --- a/runner/internal/common/utils.go +++ b/runner/internal/common/utils/utils.go @@ -1,4 +1,4 @@ -package common +package utils import ( "context" @@ -7,7 +7,7 @@ import ( "path" "slices" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) func PathExists(pth string) (bool, error) { diff 
--git a/runner/internal/common/utils_test.go b/runner/internal/common/utils/utils_test.go similarity index 99% rename from runner/internal/common/utils_test.go rename to runner/internal/common/utils/utils_test.go index 5fe780d503..f38ac57925 100644 --- a/runner/internal/common/utils_test.go +++ b/runner/internal/common/utils/utils_test.go @@ -1,4 +1,4 @@ -package common +package utils import ( "context" diff --git a/runner/internal/runner/api/http.go b/runner/internal/runner/api/http.go index 4d1c7daf54..34220acc6e 100644 --- a/runner/internal/runner/api/http.go +++ b/runner/internal/runner/api/http.go @@ -11,10 +11,10 @@ import ( "net/http" "strconv" - "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/executor" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/schemas" + "github.com/dstackai/dstack/runner/internal/common/api" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/runner/executor" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) // TODO: set some reasonable value; (optional) make configurable diff --git a/runner/internal/runner/api/server.go b/runner/internal/runner/api/server.go index ba577d1a5b..11b76d887e 100644 --- a/runner/internal/runner/api/server.go +++ b/runner/internal/runner/api/server.go @@ -7,10 +7,10 @@ import ( _ "net/http/pprof" "time" - "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/executor" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/metrics" + "github.com/dstackai/dstack/runner/internal/common/api" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/runner/executor" + "github.com/dstackai/dstack/runner/internal/runner/metrics" ) type Server struct { diff --git a/runner/internal/runner/api/ws.go 
b/runner/internal/runner/api/ws.go index bc6e476c0e..3229701a68 100644 --- a/runner/internal/runner/api/ws.go +++ b/runner/internal/runner/api/ws.go @@ -8,7 +8,7 @@ import ( "github.com/gorilla/websocket" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) type logsWsRequestParams struct { diff --git a/runner/internal/connections/connections.go b/runner/internal/runner/connections/connections.go similarity index 98% rename from runner/internal/connections/connections.go rename to runner/internal/runner/connections/connections.go index 37aedad7a2..4a56a6f172 100644 --- a/runner/internal/connections/connections.go +++ b/runner/internal/runner/connections/connections.go @@ -8,7 +8,7 @@ import ( "github.com/prometheus/procfs" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) const connStateEstablished = 1 diff --git a/runner/internal/connections/connections_test.go b/runner/internal/runner/connections/connections_test.go similarity index 100% rename from runner/internal/connections/connections_test.go rename to runner/internal/runner/connections/connections_test.go diff --git a/runner/internal/executor/base.go b/runner/internal/runner/executor/base.go similarity index 75% rename from runner/internal/executor/base.go rename to runner/internal/runner/executor/base.go index fac1266fb0..b8093e5e72 100644 --- a/runner/internal/executor/base.go +++ b/runner/internal/runner/executor/base.go @@ -4,8 +4,8 @@ import ( "context" "io" - "github.com/dstackai/dstack/runner/internal/schemas" - "github.com/dstackai/dstack/runner/internal/types" + "github.com/dstackai/dstack/runner/internal/common/types" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) type Executor interface { @@ -15,10 +15,10 @@ type Executor interface { GetJobInfo(ctx context.Context) (username string, workingDir string, err error) Run(ctx context.Context) error SetJob(job 
schemas.SubmitBody) - SetJobState(ctx context.Context, state types.JobState) + SetJobState(ctx context.Context, state schemas.JobState) SetJobStateWithTerminationReason( ctx context.Context, - state types.JobState, + state schemas.JobState, terminationReason types.TerminationReason, terminationMessage string, ) diff --git a/runner/internal/executor/env.go b/runner/internal/runner/executor/env.go similarity index 100% rename from runner/internal/executor/env.go rename to runner/internal/runner/executor/env.go diff --git a/runner/internal/executor/env_test.go b/runner/internal/runner/executor/env_test.go similarity index 100% rename from runner/internal/executor/env_test.go rename to runner/internal/runner/executor/env_test.go diff --git a/runner/internal/executor/executor.go b/runner/internal/runner/executor/executor.go similarity index 93% rename from runner/internal/executor/executor.go rename to runner/internal/runner/executor/executor.go index 311eddaa10..98289eb4ec 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/runner/executor/executor.go @@ -24,15 +24,15 @@ import ( "github.com/sirupsen/logrus" "golang.org/x/sys/unix" - "github.com/dstackai/dstack/runner/consts" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/connections" - cap "github.com/dstackai/dstack/runner/internal/linux/capabilities" - linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/schemas" - "github.com/dstackai/dstack/runner/internal/ssh" - "github.com/dstackai/dstack/runner/internal/types" + "github.com/dstackai/dstack/runner/internal/common/consts" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/types" + "github.com/dstackai/dstack/runner/internal/common/utils" + "github.com/dstackai/dstack/runner/internal/runner/connections" + cap 
"github.com/dstackai/dstack/runner/internal/runner/linux/capabilities" + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/schemas" + "github.com/dstackai/dstack/runner/internal/runner/ssh" ) // TODO: Tune these parameters for optimal experience/performance @@ -164,7 +164,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { jobLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerJobLogFileName)) if err != nil { - ex.SetJobState(ctx, types.JobStateFailed) + ex.SetJobState(ctx, schemas.JobStateFailed) return fmt.Errorf("create job log file: %w", err) } defer func() { _ = jobLogFile.Close() }() @@ -173,7 +173,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { // recover goes after postRun(), which closes runnerLogFile, to keep the log if r := recover(); r != nil { log.Error(ctx, "Executor PANIC", "err", r) - ex.SetJobState(ctx, types.JobStateFailed) + ex.SetJobState(ctx, schemas.JobStateFailed) err = fmt.Errorf("recovered: %v", r) } // no more logs will be written after this @@ -211,7 +211,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { if err := ex.setupRepo(ctx); err != nil { ex.SetJobStateWithTerminationReason( ctx, - types.JobStateFailed, + schemas.JobStateFailed, types.TerminationReasonContainerExitedWithError, fmt.Sprintf("Failed to set up the repo (%s)", err), ) @@ -221,7 +221,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { if err := ex.setupFiles(ctx); err != nil { ex.SetJobStateWithTerminationReason( ctx, - types.JobStateFailed, + schemas.JobStateFailed, types.TerminationReasonExecutorError, fmt.Sprintf("Failed to set up files (%s)", err), ) @@ -232,7 +232,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { go ex.connectionTracker.Track(connectionTrackerTicker.C) defer ex.connectionTracker.Stop() - ex.SetJobState(ctx, types.JobStateRunning) + ex.SetJobState(ctx, 
schemas.JobStateRunning) timeoutCtx := ctx var cancelTimeout context.CancelFunc if ex.jobSpec.MaxDuration != 0 { @@ -243,7 +243,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { select { case <-ctx.Done(): log.Error(ctx, "Job canceled") - ex.SetJobState(ctx, types.JobStateTerminated) + ex.SetJobState(ctx, schemas.JobStateTerminated) return fmt.Errorf("job canceled: %w", err) default: } @@ -253,7 +253,7 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { log.Error(ctx, "Max duration exceeded", "max_duration", ex.jobSpec.MaxDuration) ex.SetJobStateWithTerminationReason( ctx, - types.JobStateTerminated, + schemas.JobStateTerminated, types.TerminationReasonMaxDurationExceeded, "Max duration exceeded", ) @@ -265,14 +265,14 @@ func (ex *RunExecutor) Run(ctx context.Context) (err error) { log.Error(ctx, "Exec failed", "err", err) var exitError *exec.ExitError if errors.As(err, &exitError) { - ex.SetJobStateWithExitStatus(ctx, types.JobStateFailed, exitError.ExitCode()) + ex.SetJobStateWithExitStatus(ctx, schemas.JobStateFailed, exitError.ExitCode()) } else { - ex.SetJobState(ctx, types.JobStateFailed) + ex.SetJobState(ctx, schemas.JobStateFailed) } return fmt.Errorf("exec job failed: %w", err) } - ex.SetJobStateWithExitStatus(ctx, types.JobStateDone, 0) + ex.SetJobStateWithExitStatus(ctx, schemas.JobStateDone, 0) return nil } @@ -286,12 +286,12 @@ func (ex *RunExecutor) SetJob(body schemas.SubmitBody) { ex.state = WaitCode } -func (ex *RunExecutor) SetJobState(ctx context.Context, state types.JobState) { +func (ex *RunExecutor) SetJobState(ctx context.Context, state schemas.JobState) { ex.SetJobStateWithTerminationReason(ctx, state, "", "") } func (ex *RunExecutor) SetJobStateWithTerminationReason( - ctx context.Context, state types.JobState, terminationReason types.TerminationReason, terminationMessage string, + ctx context.Context, state schemas.JobState, terminationReason types.TerminationReason, terminationMessage string, ) { ex.mu.Lock() 
ex.jobStateHistory = append( @@ -311,7 +311,7 @@ func (ex *RunExecutor) SetJobStateWithTerminationReason( } func (ex *RunExecutor) SetJobStateWithExitStatus( - ctx context.Context, state types.JobState, exitStatus int, + ctx context.Context, state schemas.JobState, exitStatus int, ) { ex.mu.Lock() ex.jobStateHistory = append( @@ -343,7 +343,7 @@ func (ex *RunExecutor) preRun(ctx context.Context) error { // logging is required for the subsequent setJob{User,WorkingDir} calls runnerLogFile, err := log.CreateAppendFile(filepath.Join(ex.tempDir, consts.RunnerLogFileName)) if err != nil { - ex.SetJobState(ctx, types.JobStateFailed) + ex.SetJobState(ctx, schemas.JobStateFailed) return fmt.Errorf("create runner log file: %w", err) } ex.runnerLogFile = runnerLogFile @@ -358,7 +358,7 @@ func (ex *RunExecutor) preRun(ctx context.Context) error { if err := ex.setJobUser(ctx); err != nil { ex.SetJobStateWithTerminationReason( ctx, - types.JobStateFailed, + schemas.JobStateFailed, types.TerminationReasonExecutorError, fmt.Sprintf("Failed to set job user (%s)", err), ) @@ -367,7 +367,7 @@ func (ex *RunExecutor) preRun(ctx context.Context) error { if err := ex.setJobWorkingDir(ctx); err != nil { ex.SetJobStateWithTerminationReason( ctx, - types.JobStateFailed, + schemas.JobStateFailed, types.TerminationReasonExecutorError, fmt.Sprintf("Failed to set job working dir (%s)", err), ) @@ -399,7 +399,7 @@ func (ex *RunExecutor) setJobWorkingDir(ctx context.Context) error { return fmt.Errorf("get working directory: %w", err) } } else { - ex.jobWorkingDir, err = common.ExpandPath(*ex.jobSpec.WorkingDir, "", ex.jobUser.HomeDir) + ex.jobWorkingDir, err = utils.ExpandPath(*ex.jobSpec.WorkingDir, "", ex.jobUser.HomeDir) if err != nil { return fmt.Errorf("expand working dir path: %w", err) } @@ -508,7 +508,7 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error } cmd.WaitDelay = ex.killDelay // kills the process if it doesn't exit in time - if err := 
common.MkdirAll(ctx, ex.jobWorkingDir, ex.jobUser.Uid, ex.jobUser.Gid, 0o755); err != nil { + if err := utils.MkdirAll(ctx, ex.jobWorkingDir, ex.jobUser.Uid, ex.jobUser.Gid, 0o755); err != nil { return fmt.Errorf("create working directory: %w", err) } cmd.Dir = ex.jobWorkingDir @@ -636,7 +636,7 @@ func (ex *RunExecutor) setupGitCredentials(ctx context.Context) (func(), error) if _, err := os.Stat(hostsPath); err == nil { return nil, fmt.Errorf("hosts.yml file already exists") } - if err := common.MkdirAll(ctx, filepath.Dir(hostsPath), ex.jobUser.Uid, ex.jobUser.Gid, 0o700); err != nil { + if err := utils.MkdirAll(ctx, filepath.Dir(hostsPath), ex.jobUser.Uid, ex.jobUser.Gid, 0o700); err != nil { return nil, fmt.Errorf("create gh config directory: %w", err) } log.Info(ctx, "Writing OAuth token", "path", hostsPath) diff --git a/runner/internal/executor/executor_test.go b/runner/internal/runner/executor/executor_test.go similarity index 98% rename from runner/internal/executor/executor_test.go rename to runner/internal/runner/executor/executor_test.go index 105493e301..915cca35a6 100644 --- a/runner/internal/executor/executor_test.go +++ b/runner/internal/runner/executor/executor_test.go @@ -14,8 +14,8 @@ import ( "testing" "time" - linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" - "github.com/dstackai/dstack/runner/internal/schemas" + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/schemas" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) diff --git a/runner/internal/executor/files.go b/runner/internal/runner/executor/files.go similarity index 91% rename from runner/internal/executor/files.go rename to runner/internal/runner/executor/files.go index 6b992ce2c1..f61b9e2429 100644 --- a/runner/internal/executor/files.go +++ b/runner/internal/runner/executor/files.go @@ -12,8 +12,8 @@ import ( "github.com/codeclysm/extract/v4" - 
"github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/utils" ) var renameRegex = regexp.MustCompile(`^([^/]*)(/|$)`) @@ -62,12 +62,12 @@ func (ex *RunExecutor) setupFiles(ctx context.Context) error { func extractFileArchive(ctx context.Context, archivePath string, destPath string, baseDir string, homeDir string, uid int, gid int) error { log.Trace(ctx, "Extracting file archive", "archive", archivePath, "dest", destPath, "base", baseDir, "home", homeDir) - destPath, err := common.ExpandPath(destPath, baseDir, homeDir) + destPath, err := utils.ExpandPath(destPath, baseDir, homeDir) if err != nil { return fmt.Errorf("expand destination path: %w", err) } destBase, destName := path.Split(destPath) - if err := common.MkdirAll(ctx, destBase, uid, gid, 0o755); err != nil { + if err := utils.MkdirAll(ctx, destBase, uid, gid, 0o755); err != nil { return fmt.Errorf("create destination directory: %w", err) } if err := os.RemoveAll(destPath); err != nil { diff --git a/runner/internal/executor/lock.go b/runner/internal/runner/executor/lock.go similarity index 100% rename from runner/internal/executor/lock.go rename to runner/internal/runner/executor/lock.go diff --git a/runner/internal/executor/logs.go b/runner/internal/runner/executor/logs.go similarity index 91% rename from runner/internal/executor/logs.go rename to runner/internal/runner/executor/logs.go index 807071eeb9..808fc84b1d 100644 --- a/runner/internal/executor/logs.go +++ b/runner/internal/runner/executor/logs.go @@ -3,7 +3,7 @@ package executor import ( "sync" - "github.com/dstackai/dstack/runner/internal/schemas" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) type appendWriter struct { diff --git a/runner/internal/executor/query.go b/runner/internal/runner/executor/query.go similarity index 94% rename from runner/internal/executor/query.go 
rename to runner/internal/runner/executor/query.go index 6678e5f8d7..f3acbf20ac 100644 --- a/runner/internal/executor/query.go +++ b/runner/internal/runner/executor/query.go @@ -1,7 +1,7 @@ package executor import ( - "github.com/dstackai/dstack/runner/internal/schemas" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) func (ex *RunExecutor) GetJobWsLogsHistory() []schemas.LogEvent { diff --git a/runner/internal/executor/repo.go b/runner/internal/runner/executor/repo.go similarity index 95% rename from runner/internal/executor/repo.go rename to runner/internal/runner/executor/repo.go index dd16092be9..116e4b225d 100644 --- a/runner/internal/executor/repo.go +++ b/runner/internal/runner/executor/repo.go @@ -13,10 +13,10 @@ import ( "github.com/codeclysm/extract/v4" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/repo" - "github.com/dstackai/dstack/runner/internal/schemas" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/utils" + "github.com/dstackai/dstack/runner/internal/runner/repo" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) // WriteRepoBlob must be called after SetJob @@ -50,7 +50,7 @@ func (ex *RunExecutor) setupRepo(ctx context.Context) error { } var err error - ex.repoDir, err = common.ExpandPath(*ex.jobSpec.RepoDir, ex.jobWorkingDir, ex.jobUser.HomeDir) + ex.repoDir, err = utils.ExpandPath(*ex.jobSpec.RepoDir, ex.jobWorkingDir, ex.jobUser.HomeDir) if err != nil { return fmt.Errorf("expand repo dir path: %w", err) } @@ -236,7 +236,7 @@ func (ex *RunExecutor) restoreRepoDir(ctx context.Context, tmpDir string) error func (ex *RunExecutor) chownRepoDir(ctx context.Context) error { log.Trace(ctx, "Chowning repo dir") - exists, err := common.PathExists(ex.repoDir) + exists, err := utils.PathExists(ex.repoDir) // We consider all errors here non-fatal if err != nil { 
log.Warning(ctx, "Failed to check if repo dir exists", "err", err) diff --git a/runner/internal/executor/states.go b/runner/internal/runner/executor/states.go similarity index 100% rename from runner/internal/executor/states.go rename to runner/internal/runner/executor/states.go diff --git a/runner/internal/executor/timestamp.go b/runner/internal/runner/executor/timestamp.go similarity index 95% rename from runner/internal/executor/timestamp.go rename to runner/internal/runner/executor/timestamp.go index b1cf0fa2cc..b06d8cf47e 100644 --- a/runner/internal/executor/timestamp.go +++ b/runner/internal/runner/executor/timestamp.go @@ -5,7 +5,7 @@ import ( "sync" "time" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) type MonotonicTimestamp struct { diff --git a/runner/internal/executor/timestamp_test.go b/runner/internal/runner/executor/timestamp_test.go similarity index 100% rename from runner/internal/executor/timestamp_test.go rename to runner/internal/runner/executor/timestamp_test.go diff --git a/runner/internal/executor/user.go b/runner/internal/runner/executor/user.go similarity index 96% rename from runner/internal/executor/user.go rename to runner/internal/runner/executor/user.go index 30affda617..df9f0fe45c 100644 --- a/runner/internal/executor/user.go +++ b/runner/internal/runner/executor/user.go @@ -10,9 +10,9 @@ import ( "strconv" "strings" - linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/schemas" + "github.com/dstackai/dstack/runner/internal/common/log" + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) func (ex *RunExecutor) setJobUser(ctx context.Context) error { diff --git a/runner/internal/executor/user_test.go b/runner/internal/runner/executor/user_test.go similarity index 98% rename from 
runner/internal/executor/user_test.go rename to runner/internal/runner/executor/user_test.go index 2bc6a19d87..c0fc202f2e 100644 --- a/runner/internal/executor/user_test.go +++ b/runner/internal/runner/executor/user_test.go @@ -8,8 +8,8 @@ import ( "github.com/stretchr/testify/require" - linuxuser "github.com/dstackai/dstack/runner/internal/linux/user" - "github.com/dstackai/dstack/runner/internal/schemas" + linuxuser "github.com/dstackai/dstack/runner/internal/runner/linux/user" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) var shouldNotBeCalledErr = errors.New("this function should not be called") diff --git a/runner/internal/linux/capabilities/capabilities_darwin.go b/runner/internal/runner/linux/capabilities/capabilities_darwin.go similarity index 100% rename from runner/internal/linux/capabilities/capabilities_darwin.go rename to runner/internal/runner/linux/capabilities/capabilities_darwin.go diff --git a/runner/internal/linux/capabilities/capabilities_linux.go b/runner/internal/runner/linux/capabilities/capabilities_linux.go similarity index 100% rename from runner/internal/linux/capabilities/capabilities_linux.go rename to runner/internal/runner/linux/capabilities/capabilities_linux.go diff --git a/runner/internal/linux/user/user.go b/runner/internal/runner/linux/user/user.go similarity index 100% rename from runner/internal/linux/user/user.go rename to runner/internal/runner/linux/user/user.go diff --git a/runner/internal/metrics/cgroups.go b/runner/internal/runner/metrics/cgroups.go similarity index 97% rename from runner/internal/metrics/cgroups.go rename to runner/internal/runner/metrics/cgroups.go index 9ce1e54fe6..7ac89db4a1 100644 --- a/runner/internal/metrics/cgroups.go +++ b/runner/internal/runner/metrics/cgroups.go @@ -8,7 +8,7 @@ import ( "os" "strings" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) func getProcessCgroupMountPoint(ctx context.Context, 
ProcPidMountsPath string) (string, error) { diff --git a/runner/internal/metrics/cgroups_test.go b/runner/internal/runner/metrics/cgroups_test.go similarity index 100% rename from runner/internal/metrics/cgroups_test.go rename to runner/internal/runner/metrics/cgroups_test.go diff --git a/runner/internal/metrics/metrics.go b/runner/internal/runner/metrics/metrics.go similarity index 95% rename from runner/internal/metrics/metrics.go rename to runner/internal/runner/metrics/metrics.go index 26acc2cdf4..56c27a2bb1 100644 --- a/runner/internal/metrics/metrics.go +++ b/runner/internal/runner/metrics/metrics.go @@ -12,14 +12,14 @@ import ( "strings" "time" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" - "github.com/dstackai/dstack/runner/internal/schemas" + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/runner/schemas" ) type MetricsCollector struct { cgroupMountPoint string - gpuVendor common.GpuVendor + gpuVendor gpu.GpuVendor } func NewMetricsCollector(ctx context.Context) (*MetricsCollector, error) { @@ -29,7 +29,7 @@ func NewMetricsCollector(ctx context.Context) (*MetricsCollector, error) { if err != nil { return nil, fmt.Errorf("get cgroup mount point: %w", err) } - gpuVendor := common.GetGpuVendor() + gpuVendor := gpu.GetGpuVendor() return &MetricsCollector{ cgroupMountPoint: cgroupMountPoint, gpuVendor: gpuVendor, @@ -141,15 +141,15 @@ func (s *MetricsCollector) GetGPUMetrics(ctx context.Context) ([]schemas.GPUMetr var metrics []schemas.GPUMetrics var err error switch s.gpuVendor { - case common.GpuVendorNvidia: + case gpu.GpuVendorNvidia: metrics, err = s.GetNVIDIAGPUMetrics(ctx) - case common.GpuVendorAmd: + case gpu.GpuVendorAmd: metrics, err = s.GetAMDGPUMetrics(ctx) - case common.GpuVendorIntel: + case gpu.GpuVendorIntel: metrics, err = s.GetIntelAcceleratorMetrics(ctx) - case 
common.GpuVendorTenstorrent: + case gpu.GpuVendorTenstorrent: err = errors.New("tenstorrent metrics not suppored") - case common.GpuVendorNone: + case gpu.GpuVendorNone: // pass } if metrics == nil { diff --git a/runner/internal/metrics/metrics_test.go b/runner/internal/runner/metrics/metrics_test.go similarity index 95% rename from runner/internal/metrics/metrics_test.go rename to runner/internal/runner/metrics/metrics_test.go index 152f31c1b7..3410435ce2 100644 --- a/runner/internal/metrics/metrics_test.go +++ b/runner/internal/runner/metrics/metrics_test.go @@ -4,7 +4,7 @@ import ( "runtime" "testing" - "github.com/dstackai/dstack/runner/internal/schemas" + "github.com/dstackai/dstack/runner/internal/runner/schemas" "github.com/stretchr/testify/assert" ) diff --git a/runner/internal/repo/diff.go b/runner/internal/runner/repo/diff.go similarity index 98% rename from runner/internal/repo/diff.go rename to runner/internal/runner/repo/diff.go index 43e6b2e20f..a7f33cad6c 100644 --- a/runner/internal/repo/diff.go +++ b/runner/internal/runner/repo/diff.go @@ -12,7 +12,7 @@ import ( "github.com/bluekeyes/go-gitdiff/gitdiff" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) func ApplyDiff(ctx context.Context, dir, patch string) error { diff --git a/runner/internal/repo/diff_test.go b/runner/internal/runner/repo/diff_test.go similarity index 100% rename from runner/internal/repo/diff_test.go rename to runner/internal/runner/repo/diff_test.go diff --git a/runner/internal/repo/manager.go b/runner/internal/runner/repo/manager.go similarity index 98% rename from runner/internal/repo/manager.go rename to runner/internal/runner/repo/manager.go index 6e546a0886..baeec40fad 100644 --- a/runner/internal/repo/manager.go +++ b/runner/internal/runner/repo/manager.go @@ -10,7 +10,7 @@ import ( gitssh "github.com/go-git/go-git/v5/plumbing/transport/ssh" "golang.org/x/crypto/ssh" - 
"github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) type Manager struct { diff --git a/runner/internal/schemas/schemas.go b/runner/internal/runner/schemas/schemas.go similarity index 92% rename from runner/internal/schemas/schemas.go rename to runner/internal/runner/schemas/schemas.go index 10ab62ea95..ca707db761 100644 --- a/runner/internal/schemas/schemas.go +++ b/runner/internal/runner/schemas/schemas.go @@ -3,11 +3,21 @@ package schemas import ( "strings" - "github.com/dstackai/dstack/runner/internal/types" + "github.com/dstackai/dstack/runner/internal/common/types" +) + +type JobState string + +const ( + JobStateDone JobState = "done" + JobStateFailed JobState = "failed" + JobStateRunning JobState = "running" + JobStateTerminated JobState = "terminated" + JobStateTerminating JobState = "terminating" ) type JobStateEvent struct { - State types.JobState `json:"state"` + State JobState `json:"state"` Timestamp int64 `json:"timestamp"` TerminationReason types.TerminationReason `json:"termination_reason"` TerminationMessage string `json:"termination_message"` diff --git a/runner/internal/ssh/sshd.go b/runner/internal/runner/ssh/sshd.go similarity index 96% rename from runner/internal/ssh/sshd.go rename to runner/internal/runner/ssh/sshd.go index d46be7e24f..05da8d1401 100644 --- a/runner/internal/ssh/sshd.go +++ b/runner/internal/runner/ssh/sshd.go @@ -11,8 +11,8 @@ import ( "syscall" "time" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/utils" ) type SshdManager interface { @@ -203,7 +203,7 @@ func copyHostKey(srcDir string, destDir string, key string) error { func prepareAuthorizedKeysFile(confDir string) (string, error) { // Ensures that the file exists, has correct ownership and permissions, and is empty akPath := path.Join(confDir, 
"authorized_keys") - if _, err := common.RemoveIfExists(akPath); err != nil { + if _, err := utils.RemoveIfExists(akPath); err != nil { return "", err } file, err := os.OpenFile(akPath, os.O_CREATE|os.O_EXCL|os.O_RDONLY, 0o644) @@ -268,7 +268,7 @@ func prepareLogPath(logDir string) (string, error) { return "", err } logPath := path.Join(logDir, "sshd.log") - if _, err := common.RemoveIfExists(logPath); err != nil { + if _, err := utils.RemoveIfExists(logPath); err != nil { return "", err } return logPath, nil diff --git a/runner/internal/shim/api/handlers.go b/runner/internal/shim/api/handlers.go index dc1be824cb..b3382d0f26 100644 --- a/runner/internal/shim/api/handlers.go +++ b/runner/internal/shim/api/handlers.go @@ -5,8 +5,8 @@ import ( "errors" "net/http" - "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/api" + "github.com/dstackai/dstack/runner/internal/common/log" "github.com/dstackai/dstack/runner/internal/shim" "github.com/dstackai/dstack/runner/internal/shim/components" "github.com/dstackai/dstack/runner/internal/shim/dcgm" diff --git a/runner/internal/shim/api/handlers_test.go b/runner/internal/shim/api/handlers_test.go index 9bc829a94c..bb19ebbf1b 100644 --- a/runner/internal/shim/api/handlers_test.go +++ b/runner/internal/shim/api/handlers_test.go @@ -6,7 +6,7 @@ import ( "strings" "testing" - common "github.com/dstackai/dstack/runner/internal/api" + commonapi "github.com/dstackai/dstack/runner/internal/common/api" ) func TestHealthcheck(t *testing.T) { @@ -15,7 +15,7 @@ func TestHealthcheck(t *testing.T) { server := NewShimServer(context.Background(), ":12345", "0.0.1.dev2", NewDummyRunner(), nil, nil, nil, nil) - f := common.JSONResponseHandler(server.HealthcheckHandler) + f := commonapi.JSONResponseHandler(server.HealthcheckHandler) f(responseRecorder, request) if responseRecorder.Code != 200 { @@ -39,7 +39,7 @@ func TestTaskSubmit(t *testing.T) 
{ request := httptest.NewRequest("POST", "/api/tasks", strings.NewReader(requestBody)) responseRecorder := httptest.NewRecorder() - firstSubmitPost := common.JSONResponseHandler(server.TaskSubmitHandler) + firstSubmitPost := commonapi.JSONResponseHandler(server.TaskSubmitHandler) firstSubmitPost(responseRecorder, request) if responseRecorder.Code != 200 { t.Errorf("Want status '%d', got '%d'", 200, responseRecorder.Code) @@ -47,7 +47,7 @@ func TestTaskSubmit(t *testing.T) { request = httptest.NewRequest("POST", "/api/tasks", strings.NewReader(requestBody)) responseRecorder = httptest.NewRecorder() - secondSubmitPost := common.JSONResponseHandler(server.TaskSubmitHandler) + secondSubmitPost := commonapi.JSONResponseHandler(server.TaskSubmitHandler) secondSubmitPost(responseRecorder, request) if responseRecorder.Code != 409 { t.Errorf("Want status '%d', got '%d'", 409, responseRecorder.Code) diff --git a/runner/internal/shim/api/server.go b/runner/internal/shim/api/server.go index 0482db7945..9008aa2efe 100644 --- a/runner/internal/shim/api/server.go +++ b/runner/internal/shim/api/server.go @@ -8,8 +8,8 @@ import ( "reflect" "sync" - "github.com/dstackai/dstack/runner/internal/api" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/api" + "github.com/dstackai/dstack/runner/internal/common/log" "github.com/dstackai/dstack/runner/internal/shim" "github.com/dstackai/dstack/runner/internal/shim/components" "github.com/dstackai/dstack/runner/internal/shim/dcgm" diff --git a/runner/internal/shim/components/utils.go b/runner/internal/shim/components/utils.go index 073832133d..a4456acaa3 100644 --- a/runner/internal/shim/components/utils.go +++ b/runner/internal/shim/components/utils.go @@ -12,8 +12,8 @@ import ( "strings" "time" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" + 
"github.com/dstackai/dstack/runner/internal/common/utils" ) const downloadTimeout = 10 * time.Minute @@ -90,7 +90,7 @@ func downloadFile(ctx context.Context, url string, path string, mode os.FileMode } func checkDstackComponent(ctx context.Context, name ComponentName, pth string) (status ComponentStatus, version string, err error) { - exists, err := common.PathExists(pth) + exists, err := utils.PathExists(pth) if err != nil { return ComponentStatusError, "", fmt.Errorf("check %s: %w", name, err) } diff --git a/runner/internal/shim/dcgm/exporter.go b/runner/internal/shim/dcgm/exporter.go index f49fb91aee..ed861eb524 100644 --- a/runner/internal/shim/dcgm/exporter.go +++ b/runner/internal/shim/dcgm/exporter.go @@ -17,7 +17,7 @@ import ( "github.com/alexellis/go-execute/v2" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) // Counter represents a single line in counters.csv, see diff --git a/runner/internal/shim/docker.go b/runner/internal/shim/docker.go index 88a7f37c02..6acfb27a51 100644 --- a/runner/internal/shim/docker.go +++ b/runner/internal/shim/docker.go @@ -31,12 +31,12 @@ import ( "github.com/docker/go-units" bytesize "github.com/inhies/go-bytesize" - "github.com/dstackai/dstack/runner/consts" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/consts" + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" + "github.com/dstackai/dstack/runner/internal/common/types" "github.com/dstackai/dstack/runner/internal/shim/backends" "github.com/dstackai/dstack/runner/internal/shim/host" - "github.com/dstackai/dstack/runner/internal/types" ) // TODO: Allow for configuration via cli arguments or environment variables. 
@@ -55,7 +55,7 @@ type DockerRunner struct { dockerParams DockerParameters dockerInfo dockersystem.Info gpus []host.GpuInfo - gpuVendor common.GpuVendor + gpuVendor gpu.GpuVendor gpuLock *GpuLock tasks TaskStorage } @@ -70,12 +70,12 @@ func NewDockerRunner(ctx context.Context, dockerParams DockerParameters) (*Docke return nil, fmt.Errorf("get docker info: %w", err) } - var gpuVendor common.GpuVendor + var gpuVendor gpu.GpuVendor gpus := host.GetGpuInfo(ctx) if len(gpus) > 0 { gpuVendor = gpus[0].Vendor } else { - gpuVendor = common.GpuVendorNone + gpuVendor = gpu.GpuVendorNone } gpuLock, err := NewGpuLock(gpus) if err != nil { @@ -135,7 +135,7 @@ func (d *DockerRunner) restoreStateFromContainers(ctx context.Context) error { log.Error(ctx, "failed to inspect container", "id", containerID, "task", taskID) } else { switch d.gpuVendor { - case common.GpuVendorNvidia: + case gpu.GpuVendorNvidia: deviceRequests := containerFull.HostConfig.DeviceRequests if len(deviceRequests) == 1 { gpuIDs = deviceRequests[0].DeviceIDs @@ -146,13 +146,13 @@ func (d *DockerRunner) restoreStateFromContainers(ctx context.Context) error { "id", containerID, "task", taskID, ) } - case common.GpuVendorAmd: + case gpu.GpuVendorAmd: for _, device := range containerFull.HostConfig.Devices { if host.IsRenderNodePath(device.PathOnHost) { gpuIDs = append(gpuIDs, device.PathOnHost) } } - case common.GpuVendorTenstorrent: + case gpu.GpuVendorTenstorrent: for _, device := range containerFull.HostConfig.Devices { if strings.HasPrefix(device.PathOnHost, "/dev/tenstorrent/") { // Extract the device ID from the path @@ -160,14 +160,14 @@ func (d *DockerRunner) restoreStateFromContainers(ctx context.Context) error { gpuIDs = append(gpuIDs, deviceID) } } - case common.GpuVendorIntel: + case gpu.GpuVendorIntel: for _, envVar := range containerFull.Config.Env { if indices, found := strings.CutPrefix(envVar, "HABANA_VISIBLE_DEVICES="); found { gpuIDs = strings.Split(indices, ",") break } } - case 
common.GpuVendorNone: + case gpu.GpuVendorNone: gpuIDs = []string{} } ports = extractPorts(ctx, containerFull.NetworkSettings.Ports) @@ -1024,12 +1024,12 @@ func configureGpuDevices(hostConfig *container.HostConfig, gpuDevices []GPUDevic } } -func configureGpus(config *container.Config, hostConfig *container.HostConfig, vendor common.GpuVendor, ids []string) { +func configureGpus(config *container.Config, hostConfig *container.HostConfig, vendor gpu.GpuVendor, ids []string) { // NVIDIA: ids are identifiers reported by nvidia-smi, GPU- strings // AMD: ids are DRI render node paths, e.g., /dev/dri/renderD128 // Tenstorrent: ids are device indices to be used with /dev/tenstorrent/ switch vendor { - case common.GpuVendorNvidia: + case gpu.GpuVendorNvidia: hostConfig.DeviceRequests = append( hostConfig.DeviceRequests, container.DeviceRequest{ @@ -1040,7 +1040,7 @@ func configureGpus(config *container.Config, hostConfig *container.HostConfig, v DeviceIDs: ids, }, ) - case common.GpuVendorAmd: + case gpu.GpuVendorAmd: // All options are listed here: https://hub.docker.com/r/rocm/pytorch // Only --device are mandatory, other seem to be performance-related. // --device=/dev/kfd @@ -1070,7 +1070,7 @@ func configureGpus(config *container.Config, hostConfig *container.HostConfig, v // --security-opt=seccomp=unconfined hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, "seccomp=unconfined") // TODO: in addition, for non-root user, --group-add=video, and possibly --group-add=render, are required. 
- case common.GpuVendorTenstorrent: + case gpu.GpuVendorTenstorrent: // For Tenstorrent, simply add each device for _, id := range ids { devicePath := fmt.Sprintf("/dev/tenstorrent/%s", id) @@ -1091,7 +1091,7 @@ func configureGpus(config *container.Config, hostConfig *container.HostConfig, v Target: "/dev/hugepages-1G", }) } - case common.GpuVendorIntel: + case gpu.GpuVendorIntel: // All options are listed here: // https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html // --runtime=habana @@ -1102,7 +1102,7 @@ func configureGpus(config *container.Config, hostConfig *container.HostConfig, v hostConfig.CapAdd = append(hostConfig.CapAdd, "SYS_NICE") // -e HABANA_VISIBLE_DEVICES=0,1,... config.Env = append(config.Env, fmt.Sprintf("HABANA_VISIBLE_DEVICES=%s", strings.Join(ids, ","))) - case common.GpuVendorNone: + case gpu.GpuVendorNone: // nothing to do } } diff --git a/runner/internal/shim/host/gpu.go b/runner/internal/shim/host/gpu.go index b2b2135efc..0452f1ff46 100644 --- a/runner/internal/shim/host/gpu.go +++ b/runner/internal/shim/host/gpu.go @@ -13,8 +13,8 @@ import ( execute "github.com/alexellis/go-execute/v2" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" ) const ( @@ -23,7 +23,7 @@ const ( ) type GpuInfo struct { - Vendor common.GpuVendor + Vendor gpu.GpuVendor Name string Vram int // MiB // NVIDIA: uuid field from nvidia-smi, "globally unique immutable alphanumeric identifier of the GPU", @@ -43,16 +43,16 @@ type GpuInfo struct { } func GetGpuInfo(ctx context.Context) []GpuInfo { - switch gpuVendor := common.GetGpuVendor(); gpuVendor { - case common.GpuVendorNvidia: + switch gpuVendor := gpu.GetGpuVendor(); gpuVendor { + case gpu.GpuVendorNvidia: return getNvidiaGpuInfo(ctx) - case common.GpuVendorAmd: + case gpu.GpuVendorAmd: return 
getAmdGpuInfo(ctx) - case common.GpuVendorIntel: + case gpu.GpuVendorIntel: return getIntelGpuInfo(ctx) - case common.GpuVendorTenstorrent: + case gpu.GpuVendorTenstorrent: return getTenstorrentGpuInfo(ctx) - case common.GpuVendorNone: + case gpu.GpuVendorNone: return []GpuInfo{} } return []GpuInfo{} @@ -99,7 +99,7 @@ func getNvidiaGpuInfo(ctx context.Context) []GpuInfo { vram = 0 } gpus = append(gpus, GpuInfo{ - Vendor: common.GpuVendorNvidia, + Vendor: gpu.GpuVendorNvidia, Name: strings.TrimSpace(record[0]), Vram: vram, ID: strings.TrimSpace(record[2]), @@ -170,7 +170,7 @@ func getAmdGpuInfo(ctx context.Context) []GpuInfo { continue } gpus = append(gpus, GpuInfo{ - Vendor: common.GpuVendorAmd, + Vendor: gpu.GpuVendorAmd, Name: amdGpu.Asic.Name, Vram: amdGpu.Vram.Size.Value, RenderNodePath: renderNodePath, @@ -233,7 +233,7 @@ func getGpusFromTtSmiSnapshot(snapshot *ttSmiSnapshot) []GpuInfo { // Create new GPU entry for "L" device lDeviceMap[uniqueID] = &GpuInfo{ - Vendor: common.GpuVendorTenstorrent, + Vendor: gpu.GpuVendorTenstorrent, Name: name, Vram: baseVram, ID: boardID, @@ -304,7 +304,7 @@ func getGpusFromTtSmiSnapshot(snapshot *ttSmiSnapshot) []GpuInfo { if !existingGpu { // Create new GPU entry lDeviceMap[uniqueID] = &GpuInfo{ - Vendor: common.GpuVendorTenstorrent, + Vendor: gpu.GpuVendorTenstorrent, Name: boardType, Vram: baseVram, ID: boardID, @@ -423,7 +423,7 @@ func getIntelGpuInfo(ctx context.Context) []GpuInfo { vram = 0 } gpus = append(gpus, GpuInfo{ - Vendor: common.GpuVendorIntel, + Vendor: gpu.GpuVendorIntel, Name: strings.TrimSpace(record[0]), Vram: vram, Index: strings.TrimSpace(record[2]), diff --git a/runner/internal/shim/host/gpu_test.go b/runner/internal/shim/host/gpu_test.go index 2f8eda8e2e..9facf9992a 100644 --- a/runner/internal/shim/host/gpu_test.go +++ b/runner/internal/shim/host/gpu_test.go @@ -7,7 +7,7 @@ import ( "strconv" "testing" - "github.com/dstackai/dstack/runner/internal/common" + 
"github.com/dstackai/dstack/runner/internal/common/gpu" ) func loadTestData(filename string) ([]byte, error) { @@ -172,7 +172,7 @@ func TestGetGpusFromTtSmiSnapshot(t *testing.T) { expectedGpus := []GpuInfo{ { - Vendor: common.GpuVendorTenstorrent, + Vendor: gpu.GpuVendorTenstorrent, Name: "n150", Vram: 12 * 1024, ID: "100018611902010", @@ -222,19 +222,19 @@ func TestGetGpusFromTtSmiSnapshotMultipleDevices(t *testing.T) { } for boardID, expected := range expectedGpus { - gpu, exists := gpusByID[boardID] + gpu_, exists := gpusByID[boardID] if !exists { t.Errorf("Expected GPU with board_id %s not found", boardID) continue } - if gpu.Name != expected.name { - t.Errorf("GPU %s: name = %s, want %s", boardID, gpu.Name, expected.name) + if gpu_.Name != expected.name { + t.Errorf("GPU %s: name = %s, want %s", boardID, gpu_.Name, expected.name) } - if gpu.Vram != expected.vram { - t.Errorf("GPU %s: VRAM = %d, want %d", boardID, gpu.Vram, expected.vram) + if gpu_.Vram != expected.vram { + t.Errorf("GPU %s: VRAM = %d, want %d", boardID, gpu_.Vram, expected.vram) } - if gpu.Vendor != common.GpuVendorTenstorrent { - t.Errorf("GPU %s: vendor = %v, want %v", boardID, gpu.Vendor, common.GpuVendorTenstorrent) + if gpu_.Vendor != gpu.GpuVendorTenstorrent { + t.Errorf("GPU %s: vendor = %v, want %v", boardID, gpu_.Vendor, gpu.GpuVendorTenstorrent) } } } @@ -263,25 +263,25 @@ func TestGetGpusFromTtSmiSnapshotGalaxy(t *testing.T) { actualTotalVram := 0 // Verify all GPUs have the correct properties - for i, gpu := range gpus { - if gpu.Vendor != common.GpuVendorTenstorrent { - t.Errorf("GPU[%d] vendor = %v, want %v", i, gpu.Vendor, common.GpuVendorTenstorrent) + for i, gpu_ := range gpus { + if gpu_.Vendor != gpu.GpuVendorTenstorrent { + t.Errorf("GPU[%d] vendor = %v, want %v", i, gpu_.Vendor, gpu.GpuVendorTenstorrent) } - if gpu.Name != "tt-galaxy-wh" { - t.Errorf("GPU[%d] name = %s, want tt-galaxy-wh", i, gpu.Name) + if gpu_.Name != "tt-galaxy-wh" { + t.Errorf("GPU[%d] name = %s, want 
tt-galaxy-wh", i, gpu_.Name) } - if gpu.ID != "100035100000000" { - t.Errorf("GPU[%d] ID = %s, want 100035100000000", i, gpu.ID) + if gpu_.ID != "100035100000000" { + t.Errorf("GPU[%d] ID = %s, want 100035100000000", i, gpu_.ID) } - if gpu.Vram != 12*1024 { - t.Errorf("GPU[%d] VRAM = %d, want %d", i, gpu.Vram, 12*1024) + if gpu_.Vram != 12*1024 { + t.Errorf("GPU[%d] VRAM = %d, want %d", i, gpu_.Vram, 12*1024) } // Verify indices are sequential (0, 1, 2, ..., 31) expectedIndex := strconv.Itoa(i) - if gpu.Index != expectedIndex { - t.Errorf("GPU[%d] index = %s, want %s", i, gpu.Index, expectedIndex) + if gpu_.Index != expectedIndex { + t.Errorf("GPU[%d] index = %s, want %s", i, gpu_.Index, expectedIndex) } - actualTotalVram += gpu.Vram + actualTotalVram += gpu_.Vram } // Verify total VRAM is 384GB diff --git a/runner/internal/shim/host/host.go b/runner/internal/shim/host/host.go index bc54a407c7..84d15d1ae8 100644 --- a/runner/internal/shim/host/host.go +++ b/runner/internal/shim/host/host.go @@ -9,7 +9,7 @@ import ( "github.com/shirou/gopsutil/v4/mem" "golang.org/x/sys/unix" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) func GetCpuCount(ctx context.Context) int { diff --git a/runner/internal/shim/host_info.go b/runner/internal/shim/host_info.go index ea717e112c..2634d939c3 100644 --- a/runner/internal/shim/host_info.go +++ b/runner/internal/shim/host_info.go @@ -7,18 +7,18 @@ import ( "os" "path/filepath" - "github.com/dstackai/dstack/runner/internal/common" + "github.com/dstackai/dstack/runner/internal/common/gpu" ) type hostInfo struct { - GpuVendor common.GpuVendor `json:"gpu_vendor"` - GpuName string `json:"gpu_name"` - GpuMemory int `json:"gpu_memory"` // MiB - GpuCount int `json:"gpu_count"` - Addresses []string `json:"addresses"` - DiskSize uint64 `json:"disk_size"` // bytes - NumCPUs int `json:"cpus"` - Memory uint64 `json:"memory"` // bytes + GpuVendor gpu.GpuVendor `json:"gpu_vendor"` + 
GpuName string `json:"gpu_name"` + GpuMemory int `json:"gpu_memory"` // MiB + GpuCount int `json:"gpu_count"` + Addresses []string `json:"addresses"` + DiskSize uint64 `json:"disk_size"` // bytes + NumCPUs int `json:"cpus"` + Memory uint64 `json:"memory"` // bytes } func WriteHostInfo(dir string, resources Resources) error { @@ -28,7 +28,7 @@ func WriteHostInfo(dir string, resources Resources) error { return err } - gpuVendor := common.GpuVendorNone + gpuVendor := gpu.GpuVendorNone gpuCount := 0 gpuMemory := 0 gpuName := "" diff --git a/runner/internal/shim/resources.go b/runner/internal/shim/resources.go index bcc589f272..e0d888873b 100644 --- a/runner/internal/shim/resources.go +++ b/runner/internal/shim/resources.go @@ -6,8 +6,8 @@ import ( "fmt" "sync" - "github.com/dstackai/dstack/runner/internal/common" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/gpu" + "github.com/dstackai/dstack/runner/internal/common/log" "github.com/dstackai/dstack/runner/internal/shim/host" ) @@ -33,21 +33,21 @@ func NewGpuLock(gpus []host.GpuInfo) (*GpuLock, error) { lock := make(map[string]bool, len(gpus)) if len(gpus) > 0 { vendor := gpus[0].Vendor - for _, gpu := range gpus { - if gpu.Vendor != vendor { + for _, gpu_ := range gpus { + if gpu_.Vendor != vendor { return nil, errors.New("multiple GPU vendors detected") } var resourceID string switch vendor { - case common.GpuVendorNvidia: - resourceID = gpu.ID - case common.GpuVendorAmd: - resourceID = gpu.RenderNodePath - case common.GpuVendorTenstorrent: - resourceID = gpu.Index - case common.GpuVendorIntel: - resourceID = gpu.Index - case common.GpuVendorNone: + case gpu.GpuVendorNvidia: + resourceID = gpu_.ID + case gpu.GpuVendorAmd: + resourceID = gpu_.RenderNodePath + case gpu.GpuVendorTenstorrent: + resourceID = gpu_.Index + case gpu.GpuVendorIntel: + resourceID = gpu_.Index + case gpu.GpuVendorNone: return nil, fmt.Errorf("unexpected GPU vendor %s", vendor) default: 
return nil, fmt.Errorf("unexpected GPU vendor %s", vendor) diff --git a/runner/internal/shim/resources_test.go b/runner/internal/shim/resources_test.go index f582d14cf2..424ff55b41 100644 --- a/runner/internal/shim/resources_test.go +++ b/runner/internal/shim/resources_test.go @@ -4,7 +4,7 @@ import ( "context" "testing" - "github.com/dstackai/dstack/runner/internal/common" + "github.com/dstackai/dstack/runner/internal/common/gpu" "github.com/dstackai/dstack/runner/internal/shim/host" "github.com/stretchr/testify/assert" ) @@ -18,8 +18,8 @@ func TestNewGpuLock_NoGpus(t *testing.T) { func TestNewGpuLock_NvidiaGpus(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, } gl, err := NewGpuLock(gpus) assert.Nil(t, err) @@ -32,8 +32,8 @@ func TestNewGpuLock_NvidiaGpus(t *testing.T) { func TestNewGpuLock_AmdGpus(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorAmd, RenderNodePath: "/dev/dri/renderD128"}, - {Vendor: common.GpuVendorAmd, RenderNodePath: "/dev/dri/renderD129"}, + {Vendor: gpu.GpuVendorAmd, RenderNodePath: "/dev/dri/renderD128"}, + {Vendor: gpu.GpuVendorAmd, RenderNodePath: "/dev/dri/renderD129"}, } gl, err := NewGpuLock(gpus) assert.Nil(t, err) @@ -46,8 +46,8 @@ func TestNewGpuLock_AmdGpus(t *testing.T) { func TestNewGpuLock_ErrorMultipleVendors(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorAmd}, - {Vendor: common.GpuVendorNvidia}, + {Vendor: gpu.GpuVendorAmd}, + {Vendor: gpu.GpuVendorNvidia}, } gl, err := NewGpuLock(gpus) assert.Nil(t, gl) @@ -68,9 +68,9 @@ func TestGpuLock_Acquire_ErrorBadCount(t *testing.T) { func TestGpuLock_Acquire_All_Available(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, - {Vendor: common.GpuVendorNvidia, ID: 
"GPU-c0de"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-c0de"}, } gl, _ := NewGpuLock(gpus) gl.lock["GPU-f00d"] = true @@ -84,8 +84,8 @@ func TestGpuLock_Acquire_All_Available(t *testing.T) { func TestGpuLock_Acquire_All_NoneAvailable(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, } gl, _ := NewGpuLock(gpus) gl.lock["GPU-beef"] = true @@ -104,10 +104,10 @@ func TestGpuLock_Acquire_All_NoGpus(t *testing.T) { func TestGpuLock_Acquire_Count_OK(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-c0de"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-cafe"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-c0de"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-cafe"}, } gl, _ := NewGpuLock(gpus) gl.lock["GPU-f00d"] = true @@ -128,8 +128,8 @@ func TestGpuLock_Acquire_Count_OK(t *testing.T) { func TestGpuLock_Acquire_Count_ErrNoCapacity(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, } gl, _ := NewGpuLock(gpus) gl.lock["GPU-f00d"] = true @@ -142,9 +142,9 @@ func TestGpuLock_Acquire_Count_ErrNoCapacity(t *testing.T) { func TestGpuLock_Lock(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-c0de"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + 
{Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-c0de"}, } gl, _ := NewGpuLock(gpus) gl.lock["GPU-beef"] = true @@ -162,8 +162,8 @@ func TestGpuLock_Lock(t *testing.T) { func TestGpuLock_Lock_Nil(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, } gl, _ := NewGpuLock(gpus) gl.lock["GPU-beef"] = true @@ -176,9 +176,9 @@ func TestGpuLock_Lock_Nil(t *testing.T) { func TestGpuLock_Release(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-c0de"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-c0de"}, } gl, _ := NewGpuLock(gpus) gl.lock["GPU-beef"] = true @@ -196,8 +196,8 @@ func TestGpuLock_Release(t *testing.T) { func TestGpuLock_Release_Nil(t *testing.T) { gpus := []host.GpuInfo{ - {Vendor: common.GpuVendorNvidia, ID: "GPU-beef"}, - {Vendor: common.GpuVendorNvidia, ID: "GPU-f00d"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-beef"}, + {Vendor: gpu.GpuVendorNvidia, ID: "GPU-f00d"}, } gl, _ := NewGpuLock(gpus) gl.lock["GPU-beef"] = true diff --git a/runner/internal/shim/task.go b/runner/internal/shim/task.go index f1d67b785c..d2fef7e02d 100644 --- a/runner/internal/shim/task.go +++ b/runner/internal/shim/task.go @@ -6,7 +6,7 @@ import ( "fmt" "sync" - "github.com/dstackai/dstack/runner/internal/log" + "github.com/dstackai/dstack/runner/internal/common/log" ) type TaskStatus string From 885521d7d49df0e1dd9a51e8dd020fdc7e794fe2 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:10:29 +0000 Subject: [PATCH 179/187] Display imported fleets with project prefix in CLI 
(#3630) --- src/dstack/_internal/cli/commands/fleet.py | 8 +++- .../cli/services/configurators/fleet.py | 6 ++- src/dstack/_internal/cli/utils/fleet.py | 15 ++++-- src/tests/_internal/cli/utils/test_fleet.py | 46 ++++++++++++++----- 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/src/dstack/_internal/cli/commands/fleet.py b/src/dstack/_internal/cli/commands/fleet.py index 130e2c3fcf..d58f50cc2a 100644 --- a/src/dstack/_internal/cli/commands/fleet.py +++ b/src/dstack/_internal/cli/commands/fleet.py @@ -95,13 +95,17 @@ def _command(self, args: argparse.Namespace): def _list(self, args: argparse.Namespace): fleets = self.api.client.fleets.list(self.api.project) if not args.watch: - print_fleets_table(fleets, verbose=args.verbose) + print_fleets_table(fleets, current_project=self.api.project, verbose=args.verbose) return try: with Live(console=console, refresh_per_second=LIVE_TABLE_REFRESH_RATE_PER_SEC) as live: while True: - live.update(get_fleets_table(fleets, verbose=args.verbose)) + live.update( + get_fleets_table( + fleets, current_project=self.api.project, verbose=args.verbose + ) + ) time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) fleets = self.api.client.fleets.list(self.api.project) except KeyboardInterrupt: diff --git a/src/dstack/_internal/cli/services/configurators/fleet.py b/src/dstack/_internal/cli/services/configurators/fleet.py index 27b607cb4a..fe1dd4c0cd 100644 --- a/src/dstack/_internal/cli/services/configurators/fleet.py +++ b/src/dstack/_internal/cli/services/configurators/fleet.py @@ -141,7 +141,7 @@ def _apply_plan(self, plan: FleetPlan, command_args: argparse.Namespace): f"Provisioning [code]{fleet.name}[/]...", console=console ) as live: while not _finished_provisioning(fleet): - table = get_fleets_table([fleet]) + table = get_fleets_table([fleet], current_project=self.api.project) live.update(table) time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) fleet = self.api.client.fleets.get(self.api.project, fleet.name) @@ -159,6 +159,7 @@ 
def _apply_plan(self, plan: FleetPlan, command_args: argparse.Namespace): [fleet], verbose=_fleet_has_failed_instances(fleet), format_date=local_time, + current_project=self.api.project, ) ) if _fleet_has_failed_instances(fleet): @@ -242,7 +243,7 @@ def _apply_plan_on_old_server(self, plan: FleetPlan, command_args: argparse.Name f"Provisioning [code]{fleet.name}[/]...", console=console ) as live: while not _finished_provisioning(fleet): - table = get_fleets_table([fleet]) + table = get_fleets_table([fleet], current_project=self.api.project) live.update(table) time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) fleet = self.api.client.fleets.get(self.api.project, fleet.name) @@ -260,6 +261,7 @@ def _apply_plan_on_old_server(self, plan: FleetPlan, command_args: argparse.Name [fleet], verbose=_fleet_has_failed_instances(fleet), format_date=local_time, + current_project=self.api.project, ) ) if _fleet_has_failed_instances(fleet): diff --git a/src/dstack/_internal/cli/utils/fleet.py b/src/dstack/_internal/cli/utils/fleet.py index fdca4270ab..70e6b5d740 100644 --- a/src/dstack/_internal/cli/utils/fleet.py +++ b/src/dstack/_internal/cli/utils/fleet.py @@ -10,13 +10,16 @@ from dstack._internal.utils.common import DateFormatter, pretty_date -def print_fleets_table(fleets: List[Fleet], verbose: bool = False) -> None: - console.print(get_fleets_table(fleets, verbose=verbose)) +def print_fleets_table(fleets: List[Fleet], current_project: str, verbose: bool = False) -> None: + console.print(get_fleets_table(fleets, current_project=current_project, verbose=verbose)) console.print() def get_fleets_table( - fleets: List[Fleet], verbose: bool = False, format_date: DateFormatter = pretty_date + fleets: List[Fleet], + current_project: str, + verbose: bool = False, + format_date: DateFormatter = pretty_date, ) -> Table: table = Table(box=None) @@ -40,6 +43,10 @@ def get_fleets_table( config = fleet.spec.configuration merged_profile = fleet.spec.merged_profile + name = fleet.name + if 
fleet.project_name != current_project: + name = f"{fleet.project_name}/{fleet.name}" + # Detect SSH fleet vs backend fleet if config.ssh_config is not None: # SSH fleet: fixed number of hosts, no cloud billing @@ -65,7 +72,7 @@ def get_fleets_table( nodes = f"{nodes} (cluster)" fleet_row: Dict[Union[str, int], Any] = { - "NAME": fleet.name, + "NAME": name, "NODES": nodes, "BACKEND": backend, "PRICE": max_price, diff --git a/src/tests/_internal/cli/utils/test_fleet.py b/src/tests/_internal/cli/utils/test_fleet.py index 1c1df4df22..82b4894095 100644 --- a/src/tests/_internal/cli/utils/test_fleet.py +++ b/src/tests/_internal/cli/utils/test_fleet.py @@ -126,6 +126,7 @@ def create_backend_fleet( gpu_count_max: int = 0, instances: Optional[List[Instance]] = None, status: FleetStatus = FleetStatus.ACTIVE, + project_name: str = "test-project", ) -> Fleet: nodes = FleetNodesSpec(min=nodes_min, target=nodes_min, max=nodes_max) @@ -154,7 +155,7 @@ def create_backend_fleet( return Fleet( id=uuid4(), name=name, - project_name="test-project", + project_name=project_name, spec=spec, created_at=datetime(2023, 1, 2, 3, 4, 5, tzinfo=timezone.utc), status=status, @@ -222,7 +223,7 @@ def test_backend_fleet_without_verbose(self): instances=[instance], ) - table = get_fleets_table([fleet], verbose=False) + table = get_fleets_table([fleet], current_project="test-project", verbose=False) cells = get_table_cells(table) assert len(cells) == 2 # 1 fleet row + 1 instance row @@ -262,7 +263,7 @@ def test_backend_fleet_with_verbose(self): instances=[instance], ) - table = get_fleets_table([fleet], verbose=True) + table = get_fleets_table([fleet], current_project="test-project", verbose=True) cells = get_table_cells(table) assert len(cells) == 2 @@ -310,7 +311,7 @@ def test_ssh_fleet_without_verbose(self): instances=[instance1, instance2], ) - table = get_fleets_table([fleet], verbose=False) + table = get_fleets_table([fleet], current_project="test-project", verbose=False) cells = 
get_table_cells(table) assert len(cells) == 3 # 1 fleet row + 2 instance rows @@ -345,7 +346,7 @@ def test_ssh_fleet_with_verbose(self): instances=[instance], ) - table = get_fleets_table([fleet], verbose=True) + table = get_fleets_table([fleet], current_project="test-project", verbose=True) cells = get_table_cells(table) assert len(cells) == 2 @@ -395,7 +396,9 @@ def test_mixed_fleets(self): instances=[ssh_instance], ) - table = get_fleets_table([backend_fleet, ssh_fleet], verbose=False) + table = get_fleets_table( + [backend_fleet, ssh_fleet], current_project="test-project", verbose=False + ) cells = get_table_cells(table) assert len(cells) == 4 # 2 fleet rows + 2 instance rows @@ -433,7 +436,9 @@ def test_fleet_status_colors(self): name="terminating", status=FleetStatus.TERMINATING, instances=[terminating_instance] ) - table = get_fleets_table([active_fleet, terminating_fleet], verbose=False) + table = get_fleets_table( + [active_fleet, terminating_fleet], current_project="test-project", verbose=False + ) active_style = get_table_cell_style(table, "STATUS", 0) assert active_style == "bold white" @@ -451,7 +456,7 @@ def test_instance_status_colors(self): instances=[idle_instance, busy_instance], ) - table = get_fleets_table([fleet], verbose=False) + table = get_fleets_table([fleet], current_project="test-project", verbose=False) idle_style = get_table_cell_style(table, "STATUS", 1) assert idle_style == "bold sea_green3" @@ -462,7 +467,7 @@ def test_instance_status_colors(self): def test_empty_fleet(self): fleet = create_backend_fleet(name="empty-fleet", instances=[]) - table = get_fleets_table([fleet], verbose=False) + table = get_fleets_table([fleet], current_project="test-project", verbose=False) cells = get_table_cells(table) assert len(cells) == 1 @@ -474,7 +479,7 @@ def test_fleet_with_max_price(self): max_price=5.0, ) - table = get_fleets_table([fleet], verbose=False) + table = get_fleets_table([fleet], current_project="test-project", verbose=False) cells = 
get_table_cells(table) assert cells[0]["PRICE"] == "$0..$5" @@ -485,7 +490,7 @@ def test_fleet_with_multiple_backends(self): backends=[BackendType.AWS, BackendType.GCP, BackendType.AZURE], ) - table = get_fleets_table([fleet], verbose=False) + table = get_fleets_table([fleet], current_project="test-project", verbose=False) cells = get_table_cells(table) assert cells[0]["BACKEND"] == "aws, gcp, azure" @@ -496,7 +501,24 @@ def test_fleet_with_any_backend(self): backends=None, ) - table = get_fleets_table([fleet], verbose=False) + table = get_fleets_table([fleet], current_project="test-project", verbose=False) cells = get_table_cells(table) assert cells[0]["BACKEND"] == "*" + + def test_with_imported_fleet(self): + current_project_fleet = create_backend_fleet( + name="current-fleet", project_name="current-project" + ) + other_project_fleet = create_backend_fleet( + name="other-fleet", project_name="other-project" + ) + table = get_fleets_table( + [current_project_fleet, other_project_fleet], + verbose=False, + current_project="current-project", + ) + cells = get_table_cells(table) + assert len(cells) == 2 + assert cells[0]["NAME"] == "current-fleet" + assert cells[1]["NAME"] == "other-project/other-fleet" From 86a463998f315a32c5cab3c480c8348e9b73d28c Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Tue, 3 Mar 2026 11:48:08 +0100 Subject: [PATCH 180/187] Fix Crusoe CPU instances and add H200/B200 support (#3619) * Fix Crusoe CPU instances and add H200/B200 support - Use docker-enabled VM image for CPU types (base image lacks Docker) - Update gpuhunt with H200, B200, and CPU instance support --- pyproject.toml | 2 +- src/dstack/_internal/core/backends/crusoe/compute.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3336fc5423..259cbf7b25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "python-multipart>=0.0.16", 
"filelock", "psutil", - "gpuhunt==0.1.17", + "gpuhunt==0.1.18", "argcomplete>=3.5.0", "ignore-python>=0.2.0", "orjson", diff --git a/src/dstack/_internal/core/backends/crusoe/compute.py b/src/dstack/_internal/core/backends/crusoe/compute.py index 10ede96776..7de7d0967c 100644 --- a/src/dstack/_internal/core/backends/crusoe/compute.py +++ b/src/dstack/_internal/core/backends/crusoe/compute.py @@ -91,18 +91,17 @@ IMAGE_SXM_DOCKER = "ubuntu22.04-nvidia-sxm-docker:latest" IMAGE_PCIE_DOCKER = "ubuntu22.04-nvidia-pcie-docker:latest" IMAGE_ROCM = "ubuntu-rocm:latest" -IMAGE_BASE = "ubuntu22.04:latest" def _get_image(instance_name: str, gpu_type: str) -> str: - if not gpu_type: - return IMAGE_BASE # Check instance name for SXM -- gpu_type from gpuhunt is normalized (e.g. "A100") # and doesn't contain "SXM", but instance names like "a100-80gb-sxm-ib.8x" do. if "-sxm" in instance_name.lower(): return IMAGE_SXM_DOCKER if "MI3" in gpu_type: return IMAGE_ROCM + # Use PCIe docker image for both PCIe GPUs and CPU-only types. + # Crusoe has no CPU-specific Docker image; the base ubuntu image lacks Docker. return IMAGE_PCIE_DOCKER From 18fd035e5f58a7a45921ebc81baf6ab642716b07 Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Tue, 3 Mar 2026 13:03:37 +0000 Subject: [PATCH 181/187] Do not show SSH fleet resources in `dstack fleet` (#3632) Always show `-` for SSH fleets, because resources are not applicable to them. Also simplify the implementation by building `fleet_row` and `instance_row` without `if verbose` checks. `if verbose` is only necessary when building the table header, and then `add_row_from_dict()` will include only the columns specified in the header. 
--- src/dstack/_internal/cli/utils/fleet.py | 24 +++++++++------------ src/tests/_internal/cli/utils/test_fleet.py | 1 + 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/dstack/_internal/cli/utils/fleet.py b/src/dstack/_internal/cli/utils/fleet.py index 70e6b5d740..2253d0a733 100644 --- a/src/dstack/_internal/cli/utils/fleet.py +++ b/src/dstack/_internal/cli/utils/fleet.py @@ -51,12 +51,16 @@ def get_fleets_table( if config.ssh_config is not None: # SSH fleet: fixed number of hosts, no cloud billing nodes = str(len(config.ssh_config.hosts)) + resources = "-" + gpu = "-" backend = "ssh" spot_policy = "-" max_price = "-" else: # Backend fleet: dynamic nodes, cloud billing nodes = _format_nodes(config.nodes) + resources = config.resources.pretty_format() if config.resources else "-" + gpu = _format_fleet_gpu(config.resources) backend = _format_backends(config.backends) spot_policy = "-" if merged_profile and merged_profile.spot_policy: @@ -74,6 +78,8 @@ def get_fleets_table( fleet_row: Dict[Union[str, int], Any] = { "NAME": name, "NODES": nodes, + "RESOURCES": resources, + "GPU": gpu, "BACKEND": backend, "PRICE": max_price, "SPOT": spot_policy, @@ -81,12 +87,6 @@ def get_fleets_table( "CREATED": format_date(fleet.created_at), } - if verbose: - fleet_row["RESOURCES"] = config.resources.pretty_format() if config.resources else "-" - fleet_row["ERROR"] = "" - else: - fleet_row["GPU"] = _format_fleet_gpu(config.resources) - add_row_from_dict(table, fleet_row) # Instance rows (indented) @@ -119,6 +119,8 @@ def get_fleets_table( instance_row: Dict[Union[str, int], Any] = { "NAME": f" instance={instance.instance_num}", "NODES": "", + "RESOURCES": _format_instance_resources(instance), + "GPU": _format_instance_gpu(instance), "BACKEND": backend_with_region, "PRICE": instance_price, "SPOT": instance_spot, @@ -126,14 +128,8 @@ def get_fleets_table( "CREATED": format_date(instance.created), } - if verbose: - instance_row["RESOURCES"] = 
_format_instance_resources(instance) - error = "" - if instance.status == InstanceStatus.TERMINATED and instance.termination_reason: - error = instance.termination_reason - instance_row["ERROR"] = error - else: - instance_row["GPU"] = _format_instance_gpu(instance) + if instance.status == InstanceStatus.TERMINATED and instance.termination_reason: + instance_row["ERROR"] = instance.termination_reason add_row_from_dict(table, instance_row, style="secondary") diff --git a/src/tests/_internal/cli/utils/test_fleet.py b/src/tests/_internal/cli/utils/test_fleet.py index 82b4894095..00fedff685 100644 --- a/src/tests/_internal/cli/utils/test_fleet.py +++ b/src/tests/_internal/cli/utils/test_fleet.py @@ -354,6 +354,7 @@ def test_ssh_fleet_with_verbose(self): fleet_row = cells[0] assert fleet_row["NAME"] == "my-ssh" assert fleet_row["NODES"] == "1 (cluster)" + assert fleet_row["RESOURCES"] == "-" assert fleet_row["BACKEND"] == "ssh" assert fleet_row["SPOT"] == "-" assert fleet_row["PRICE"] == "-" From 4b4d1f68cf23d89b1e339893356cd08505fa1f2a Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov <54148038+peterschmidt85@users.noreply.github.com> Date: Wed, 4 Mar 2026 15:38:11 +0100 Subject: [PATCH 182/187] [UI] Unify Connect UX across run configuration types (plus fix) (#3622) * [UI] Add Connect section for tasks with ports and fix launch wizard issues Add a Connect component for task runs that expose ports, with a two-step wizard (Attach + Open). Single port shows a simple button; multiple ports use a ButtonDropdown to select and open any forwarded port. 
Also fix two launch wizard issues: - Add `ports` to the supported YAML fields whitelist - Stop setting `docker: true` when selecting a Docker image (unrelated field) - Fix `IJobSpec.app_specs` type from singular to array - Show `-` for empty configuration path on run details Made-with: Cursor * [UI] Refine Connect wizard interactions Address PR feedback by making Connect panels collapsible across run types, using Done to collapse wizard flows, and fixing task Open-step behavior when map_to_port is unset. Made-with: Cursor --------- Co-authored-by: Andrey Cheptsov --- .../pages/Project/Details/Settings/index.tsx | 2 +- .../index.tsx | 40 ++- .../RunDetails/ConnectToServiceRun/index.tsx | 85 +++++- .../RunDetails/ConnectToTaskRun/index.tsx | 269 ++++++++++++++++++ .../pages/Runs/Details/RunDetails/index.tsx | 8 +- .../components/ParamsWizardStep/index.tsx | 2 - frontend/src/pages/Runs/Launch/constants.tsx | 10 +- .../Runs/Launch/hooks/useGenerateYaml.ts | 4 +- .../Launch/hooks/useGetRunSpecFromYaml.ts | 1 + frontend/src/pages/Runs/Launch/types.ts | 1 - frontend/src/types/run.d.ts | 2 +- 11 files changed, 373 insertions(+), 51 deletions(-) create mode 100644 frontend/src/pages/Runs/Details/RunDetails/ConnectToTaskRun/index.tsx diff --git a/frontend/src/pages/Project/Details/Settings/index.tsx b/frontend/src/pages/Project/Details/Settings/index.tsx index 7d2b9bd3f0..45e5bb3a90 100644 --- a/frontend/src/pages/Project/Details/Settings/index.tsx +++ b/frontend/src/pages/Project/Details/Settings/index.tsx @@ -230,7 +230,7 @@ export const ProjectSettings: React.FC = () => { onNavigate={({ detail }) => setActiveStepIndex(detail.requestedStepIndex)} activeStepIndex={activeStepIndex} onSubmit={() => setIsExpandedCliSection(false)} - submitButtonText="Dismiss" + submitButtonText="Done" allowSkipTo={true} steps={[ { diff --git a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx 
b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx index 54d03c388d..bcd618a0dd 100644 --- a/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx +++ b/frontend/src/pages/Runs/Details/RunDetails/ConnectToRunWithDevEnvConfiguration/index.tsx @@ -1,20 +1,7 @@ import React, { FC } from 'react'; import { useTranslation } from 'react-i18next'; -import { - Alert, - Box, - Button, - Code, - Container, - ExpandableSection, - Header, - Popover, - SpaceBetween, - StatusIndicator, - Tabs, - Wizard, -} from 'components'; +import { Alert, Box, Button, Code, ExpandableSection, Popover, SpaceBetween, StatusIndicator, Tabs, Wizard } from 'components'; import { copyToClipboard } from 'libs'; @@ -28,6 +15,7 @@ const PipInstallCommand = 'pip install dstack -U'; export const ConnectToRunWithDevEnvConfiguration: FC<{ run: IRun }> = ({ run }) => { const { t } = useTranslation(); + const [isExpandedConnectSection, setIsExpandedConnectSection] = React.useState(true); const getAttachCommand = (runData: IRun) => { const attachCommand = `dstack attach ${runData.run_spec.run_name} --logs`; @@ -62,9 +50,19 @@ export const ConnectToRunWithDevEnvConfiguration: FC<{ run: IRun }> = ({ run }) const [configCliCommand, copyCliCommand] = useConfigProjectCliCommand({ projectName: run.project_name }); return ( - -
    Connect
    - + setIsExpandedConnectSection(detail.expanded)} + headerActions={ +
    +
    + + + + + To use dstack, install the CLI on your local machine. + + +
    + {UvInstallCommand} + +
    + + {t('common.copied')} + + } + > +
    +
    + + ), + }, + { + label: 'pip', + id: 'pip', + content: ( + <> +
    + {PipInstallCommand} + +
    + + {t('common.copied')} + + } + > +
    +
    + + ), + }, + ]} + /> + + And then configure the project. + +
    + {configCliCommand} + +
    + + {t('common.copied')} + + } + > +
    +
    +
    +
    + + ), + isOptional: true, + }, + ...(mappedAppSpecs.length > 0 + ? [ + { + title: 'Open', + description: 'After the CLI is attached, use the forwarded localhost URLs.', + content: ( + + {mappedAppSpecs.map((spec) => { + const mappedPort = getMappedPort(spec)!; + const localUrl = `http://127.0.0.1:${mappedPort}`; + + return ( + + {t('common.copied')} + + } + > +
    = ({ filteringAriaLabel: t('projects.run.filter_property_placeholder'), filteringPlaceholder: t('projects.run.filter_property_placeholder'), operationAndText: 'and', + enteredTextLabel: (value) => `Use: ${value}`, }} filteringOptions={filteringOptions} filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} /> diff --git a/frontend/src/pages/Fleets/List/hooks.tsx b/frontend/src/pages/Fleets/List/hooks.tsx index 639d7b8683..bfff69d0ec 100644 --- a/frontend/src/pages/Fleets/List/hooks.tsx +++ b/frontend/src/pages/Fleets/List/hooks.tsx @@ -8,10 +8,19 @@ import type { PropertyFilterProps } from 'components'; import { Button, ListEmptyMessage, NavigateLink, StatusIndicator, TableProps } from 'components'; import { DATE_TIME_FORMAT } from 'consts'; -import { useProjectFilter } from 'hooks/useProjectFilter'; +import { useLocalStorageState } from 'hooks'; import { EMPTY_QUERY, requestParamsToTokens, tokensToRequestParams, tokensToSearchParams } from 'libs/filters'; -import { formatFleetBackend, formatFleetResources, getFleetInstancesLinkText, getFleetPrice, getFleetStatusIconType } from 'libs/fleet'; +import { + formatFleetBackend, + formatFleetResources, + getFleetInstancesLinkText, + getFleetPrice, + getFleetStatusIconType, +} from 'libs/fleet'; import { ROUTES } from 'routes'; +import { useLazyGetProjectsQuery } from 'services/project'; + +const limit = 100; export const useEmptyMessages = ({ clearFilter, @@ -115,10 +124,12 @@ const filterKeys: Record = { PROJECT_NAME: 'project_name', }; -export const useFilters = (localStorePrefix = 'fleet-list-page') => { +export const useFilters = () => { const [searchParams, setSearchParams] = useSearchParams(); - const [onlyActive, setOnlyActive] = useState(() => searchParams.get('only_active') === 'true'); - const { projectOptions } = useProjectFilter({ localStorePrefix }); + const [onlyActive, setOnlyActive] = useLocalStorageState('fleet-list-filter-only-active', 
true); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => requestParamsToTokens({ searchParams, filterKeys }), @@ -126,23 +137,12 @@ export const useFilters = (localStorePrefix = 'fleet-list-page') => { const clearFilter = () => { setSearchParams({}); - setOnlyActive(false); setPropertyFilterQuery(EMPTY_QUERY); }; const filteringOptions = useMemo(() => { - const options: PropertyFilterProps.FilteringOption[] = []; - - projectOptions.forEach(({ value }) => { - if (value) - options.push({ - propertyKey: filterKeys.PROJECT_NAME, - value, - }); - }); - - return options; - }, [projectOptions]); + return [...dynamicFilteringOptions]; + }, [dynamicFilteringOptions]); const filteringProperties = [ { @@ -170,8 +170,6 @@ export const useFilters = (localStorePrefix = 'fleet-list-page') => { const onChangeOnlyActive: ToggleProps['onChange'] = ({ detail }) => { setOnlyActive(detail.checked); - - setSearchParams(tokensToSearchParams(propertyFilterQuery.tokens, detail.checked)); }; const filteringRequestParams = useMemo(() => { @@ -187,6 +185,30 @@ export const useFilters = (localStorePrefix = 'fleet-list-page') => { const isDisabledClearFilter = !propertyFilterQuery.tokens.length && !onlyActive; + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + if (!filteringText.length) { + return Promise.resolve(); + } + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects({ name_pattern: filteringText, limit }) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setDynamicFilteringOptions); + } + + 
setFilteringStatusType(undefined); + }; + return { filteringRequestParams, clearFilter, @@ -197,5 +219,7 @@ export const useFilters = (localStorePrefix = 'fleet-list-page') => { onlyActive, onChangeOnlyActive, isDisabledClearFilter, + filteringStatusType, + handleLoadItems, } as const; }; diff --git a/frontend/src/pages/Fleets/List/index.tsx b/frontend/src/pages/Fleets/List/index.tsx index 0a29192e0c..7e5ef21cf5 100644 --- a/frontend/src/pages/Fleets/List/index.tsx +++ b/frontend/src/pages/Fleets/List/index.tsx @@ -36,6 +36,8 @@ export const FleetList: React.FC = () => { onlyActive, onChangeOnlyActive, isDisabledClearFilter, + filteringStatusType, + handleLoadItems, } = useFilters(); const projectHavingFleetMap = useCheckingForFleetsInProjects({}); @@ -127,9 +129,12 @@ export const FleetList: React.FC = () => { filteringAriaLabel: t('fleets.filter_property_placeholder'), filteringPlaceholder: t('fleets.filter_property_placeholder'), operationAndText: 'and', + enteredTextLabel: (value) => `Use: ${value}`, }} filteringOptions={filteringOptions} filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} /> diff --git a/frontend/src/pages/Instances/List/hooks/useFilters.ts b/frontend/src/pages/Instances/List/hooks/useFilters.ts index 55453c33e4..bb3a5286bc 100644 --- a/frontend/src/pages/Instances/List/hooks/useFilters.ts +++ b/frontend/src/pages/Instances/List/hooks/useFilters.ts @@ -4,8 +4,9 @@ import { ToggleProps } from '@cloudscape-design/components'; import type { PropertyFilterProps } from 'components'; -import { useProjectFilter } from 'hooks/useProjectFilter'; +import { useLocalStorageState } from 'hooks'; import { EMPTY_QUERY, requestParamsToTokens, tokensToRequestParams, tokensToSearchParams } from 'libs/filters'; +import { useLazyGetProjectsQuery } from 'services/project'; type RequestParamsKeys = keyof Pick; @@ -14,10 +15,14 @@ const filterKeys: Record = { FLEET_IDS: 'fleet_ids', }; -export const 
useFilters = (localStorePrefix = 'instances-list-page') => { +const limit = 100; + +export const useFilters = () => { const [searchParams, setSearchParams] = useSearchParams(); - const [onlyActive, setOnlyActive] = useState(() => searchParams.get('only_active') === 'true'); - const { projectOptions } = useProjectFilter({ localStorePrefix }); + const [onlyActive, setOnlyActive] = useLocalStorageState('instance-list-filter-only-active', true); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => { return requestParamsToTokens({ searchParams, filterKeys }); @@ -25,23 +30,12 @@ export const useFilters = (localStorePrefix = 'instances-list-page') => { const clearFilter = () => { setSearchParams({}); - setOnlyActive(false); setPropertyFilterQuery(EMPTY_QUERY); }; const filteringOptions = useMemo(() => { - const options: PropertyFilterProps.FilteringOption[] = []; - - projectOptions.forEach(({ value }) => { - if (value) - options.push({ - propertyKey: filterKeys.PROJECT_NAMES, - value, - }); - }); - - return options; - }, [projectOptions]); + return [...dynamicFilteringOptions]; + }, [dynamicFilteringOptions]); const filteringProperties = [ { @@ -54,6 +48,7 @@ export const useFilters = (localStorePrefix = 'instances-list-page') => { key: filterKeys.FLEET_IDS, operators: ['='], propertyLabel: 'Fleet ID', + groupValuesLabel: 'Fleet ID values', }, ]; @@ -70,8 +65,6 @@ export const useFilters = (localStorePrefix = 'instances-list-page') => { const onChangeOnlyActive: ToggleProps['onChange'] = ({ detail }) => { setOnlyActive(detail.checked); - - setSearchParams(tokensToSearchParams(propertyFilterQuery.tokens, detail.checked)); }; const filteringRequestParams = useMemo(() => { @@ -88,6 +81,32 @@ export const useFilters = (localStorePrefix = 'instances-list-page') => { 
const isDisabledClearFilter = !propertyFilterQuery.tokens.length && !onlyActive; + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + console.log({ filteringProperty, filteringText }); + + if (!filteringText.length) { + return Promise.resolve(); + } + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAMES) { + await getProjects({ name_pattern: filteringText, limit }) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAMES, + value: project_name, + })), + ) + .then(setDynamicFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + return { filteringRequestParams, clearFilter, @@ -98,5 +117,7 @@ export const useFilters = (localStorePrefix = 'instances-list-page') => { onlyActive, onChangeOnlyActive, isDisabledClearFilter, + filteringStatusType, + handleLoadItems, } as const; }; diff --git a/frontend/src/pages/Instances/List/index.tsx b/frontend/src/pages/Instances/List/index.tsx index 423ebc77f9..a0cd2be951 100644 --- a/frontend/src/pages/Instances/List/index.tsx +++ b/frontend/src/pages/Instances/List/index.tsx @@ -38,6 +38,8 @@ export const List: React.FC = () => { onlyActive, onChangeOnlyActive, isDisabledClearFilter, + filteringStatusType, + handleLoadItems, } = useFilters(); const { data, isLoading, refreshList, isLoadingMore } = useInfiniteScroll({ @@ -116,9 +118,12 @@ export const List: React.FC = () => { filteringAriaLabel: t('projects.run.filter_property_placeholder'), filteringPlaceholder: t('projects.run.filter_property_placeholder'), operationAndText: 'and', + enteredTextLabel: (value) => `Use: ${value}`, }} filteringOptions={filteringOptions} filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} /> diff --git a/frontend/src/pages/Models/List/hooks.tsx 
b/frontend/src/pages/Models/List/hooks.tsx index 461bf28a3b..3f1449de66 100644 --- a/frontend/src/pages/Models/List/hooks.tsx +++ b/frontend/src/pages/Models/List/hooks.tsx @@ -7,10 +7,10 @@ import type { PropertyFilterProps } from 'components'; import { Button, ListEmptyMessage, NavigateLink, TableProps } from 'components'; import { DATE_TIME_FORMAT } from 'consts'; -import { useProjectFilter } from 'hooks/useProjectFilter'; import { EMPTY_QUERY, requestParamsToTokens, tokensToRequestParams, tokensToSearchParams } from 'libs/filters'; import { ROUTES } from 'routes'; -import { useGetUserListQuery } from 'services/user'; +import { useLazyGetProjectsQuery } from 'services/project'; +import { useLazyGetUserListQuery } from 'services/user'; import { getModelGateway } from '../helpers'; @@ -126,10 +126,15 @@ const filterKeys: Record = { USER_NAME: 'username', }; -export const useFilters = (localStorePrefix = 'models-list-page') => { +const limit = 100; + +export const useFilters = () => { const [searchParams, setSearchParams] = useSearchParams(); - const { projectOptions } = useProjectFilter({ localStorePrefix }); - const { data: usersData } = useGetUserListQuery({}); + + const [filteringOptions, setFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + const [getUsers] = useLazyGetUserListQuery(); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => requestParamsToTokens({ searchParams, filterKeys }), @@ -140,27 +145,6 @@ export const useFilters = (localStorePrefix = 'models-list-page') => { setPropertyFilterQuery(EMPTY_QUERY); }; - const filteringOptions = useMemo(() => { - const options: PropertyFilterProps.FilteringOption[] = []; - - projectOptions.forEach(({ value }) => { - if (value) - options.push({ - propertyKey: filterKeys.PROJECT_NAME, - value, - }); - }); - - usersData?.data?.forEach(({ username }) => { - options.push({ - propertyKey: 
filterKeys.USER_NAME, - value: username, - }); - }); - - return options; - }, [projectOptions, usersData]); - const filteringProperties = [ { key: filterKeys.PROJECT_NAME, @@ -172,6 +156,7 @@ export const useFilters = (localStorePrefix = 'models-list-page') => { key: filterKeys.USER_NAME, operators: ['='], propertyLabel: 'User', + groupValuesLabel: 'User values', }, ]; @@ -196,6 +181,42 @@ export const useFilters = (localStorePrefix = 'models-list-page') => { }) as Partial; }, [propertyFilterQuery]); + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setFilteringOptions([]); + + if (!filteringText.length) { + return Promise.resolve(); + } + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects({ name_pattern: filteringText, limit }) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setFilteringOptions); + } + + if (filteringProperty?.key === filterKeys.USER_NAME) { + await getUsers({ name_pattern: filteringText, limit }) + .unwrap() + .then(({ data }) => + data.map(({ username }) => ({ + propertyKey: filterKeys.USER_NAME, + value: username, + })), + ) + .then(setFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + return { filteringRequestParams, clearFilter, @@ -203,5 +224,7 @@ export const useFilters = (localStorePrefix = 'models-list-page') => { onChangePropertyFilter, filteringOptions, filteringProperties, + filteringStatusType, + handleLoadItems, } as const; }; diff --git a/frontend/src/pages/Models/List/index.tsx b/frontend/src/pages/Models/List/index.tsx index f0dffa4cf7..769c8bc105 100644 --- a/frontend/src/pages/Models/List/index.tsx +++ b/frontend/src/pages/Models/List/index.tsx @@ -26,6 +26,8 @@ export const List: React.FC = () => { filteringOptions, filteringProperties, filteringRequestParams, + 
filteringStatusType, + handleLoadItems, } = useFilters(); useBreadcrumbs([ @@ -98,9 +100,12 @@ export const List: React.FC = () => { filteringAriaLabel: t('projects.run.filter_property_placeholder'), filteringPlaceholder: t('projects.run.filter_property_placeholder'), operationAndText: 'and', + enteredTextLabel: (value) => `Use: ${value}`, }} filteringOptions={filteringOptions} filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} /> diff --git a/frontend/src/pages/Offers/List/hooks/useFilters.ts b/frontend/src/pages/Offers/List/hooks/useFilters.ts index 20c95402c0..b44cdfcee0 100644 --- a/frontend/src/pages/Offers/List/hooks/useFilters.ts +++ b/frontend/src/pages/Offers/List/hooks/useFilters.ts @@ -3,7 +3,6 @@ import { useSearchParams } from 'react-router-dom'; import type { MultiselectProps, PropertyFilterProps } from 'components'; -import { useProjectFilter } from 'hooks/useProjectFilter'; import { EMPTY_QUERY, requestParamsToArray, @@ -11,6 +10,7 @@ import { tokensToRequestParams, tokensToSearchParams, } from 'libs/filters'; +import { useGetProjectsQuery, useLazyGetProjectsQuery } from 'services/project'; import { getPropertyFilterOptions } from '../helpers'; @@ -54,43 +54,51 @@ const filteringProperties = [ key: filterKeys.PROJECT_NAME, operators: ['='], propertyLabel: 'Project', + groupValuesLabel: 'Project values', }, { key: filterKeys.GPU_NAME, operators: ['='], propertyLabel: 'GPU name', + groupValuesLabel: 'GPU name values', }, { key: filterKeys.GPU_COUNT, operators: ['<=', '>='], propertyLabel: 'GPU count', + groupValuesLabel: 'GPU count values', }, { key: filterKeys.GPU_MEMORY, operators: ['<=', '>='], propertyLabel: 'GPU memory', + groupValuesLabel: 'GPU memory values', }, { key: filterKeys.BACKEND, operators: ['='], propertyLabel: 'Backend', + groupValuesLabel: 'Backend values', }, { key: filterKeys.SPOT_POLICY, operators: ['='], propertyLabel: 'Spot policy', + groupValuesLabel: 'Spot 
policy values', }, ]; const gpuFilterOption = { label: 'GPU', value: 'gpu' }; - const defaultGroupByOptions = [{ ...gpuFilterOption }, { label: 'Backend', value: 'backend' }]; - const groupByRequestParamName: RequestParamsKeys = 'group_by'; +const limit = 100; export const useFilters = ({ gpus, withSearchParams = true, permanentFilters = {}, defaultFilters }: UseFiltersArgs) => { const [searchParams, setSearchParams] = useSearchParams(); - const { projectOptions } = useProjectFilter({ localStorePrefix: 'offers-list-projects' }); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + const { data: projectsData } = useGetProjectsQuery({ limit: 1 }); const projectNameIsChecked = useRef(false); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => @@ -119,18 +127,10 @@ export const useFilters = ({ gpus, withSearchParams = true, permanentFilters = { }; const filteringOptions = useMemo(() => { - const options: PropertyFilterProps.FilteringOption[] = [...spotPolicyOptions]; + const options: PropertyFilterProps.FilteringOption[] = [...spotPolicyOptions, ...dynamicFilteringOptions]; const { names, backends } = getPropertyFilterOptions(gpus); - projectOptions.forEach(({ value }) => { - if (value) - options.push({ - propertyKey: filterKeys.PROJECT_NAME, - value, - }); - }); - Array.from(names).forEach((name) => { options.push({ propertyKey: filterKeys.GPU_NAME, @@ -146,7 +146,7 @@ export const useFilters = ({ gpus, withSearchParams = true, permanentFilters = { }); return options; - }, [gpus]); + }, [gpus, dynamicFilteringOptions]); const groupByOptions: MultiselectProps.Options = useMemo(() => { return defaultGroupByOptions.map((option) => { @@ -243,8 +243,32 @@ export const useFilters = ({ gpus, withSearchParams = true, permanentFilters = { }; }, [propertyFilterQuery, permanentFilters]); + const handleLoadItems: 
PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + if (!filteringText.length) { + return Promise.resolve(); + } + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects({ name_pattern: filteringText, limit }) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setDynamicFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + useEffect(() => { - if (!projectNameIsChecked.current && projectOptions.length) { + if (!projectNameIsChecked.current && projectsData?.data?.length) { projectNameIsChecked.current = true; if (!filteringRequestParams['project_name']) { @@ -254,14 +278,14 @@ export const useFilters = ({ gpus, withSearchParams = true, permanentFilters = { { operator: '=', propertyKey: filterKeys.PROJECT_NAME, - value: projectOptions[0].value, + value: projectsData.data[0].project_name, }, ], operation: 'and', }); } } - }, [projectOptions]); + }, [projectsData]); return { filteringRequestParams, @@ -273,5 +297,7 @@ export const useFilters = ({ gpus, withSearchParams = true, permanentFilters = { groupBy, groupByOptions, onChangeGroupBy, + filteringStatusType, + handleLoadItems, } as const; }; diff --git a/frontend/src/pages/Offers/List/index.tsx b/frontend/src/pages/Offers/List/index.tsx index c594e67f60..a44dbd48f4 100644 --- a/frontend/src/pages/Offers/List/index.tsx +++ b/frontend/src/pages/Offers/List/index.tsx @@ -105,6 +105,8 @@ export const OfferList: React.FC = ({ groupBy, groupByOptions, onChangeGroupBy, + filteringStatusType, + handleLoadItems, } = useFilters({ gpus: data?.gpus ?? 
[], withSearchParams, permanentFilters, defaultFilters }); useEffect(() => { @@ -239,38 +241,43 @@ export const OfferList: React.FC = ({ loading={!disabled && (isLoading || isFetching)} loadingText={t('common.loading')} stickyHeader={true} - filter={disabled ? undefined : ( -
    -
    - -
    + filter={ + disabled ? undefined : ( +
    +
    + `Use: ${value}`, + }} + filteringOptions={filteringOptions} + filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} + /> +
    -
    - +
    + +
    -
    - )} + ) + } /> ); }; diff --git a/frontend/src/pages/Project/Details/Events/index.tsx b/frontend/src/pages/Project/Details/Events/index.tsx index df18300c52..f01186cecc 100644 --- a/frontend/src/pages/Project/Details/Events/index.tsx +++ b/frontend/src/pages/Project/Details/Events/index.tsx @@ -2,10 +2,11 @@ import React from 'react'; import { useTranslation } from 'react-i18next'; import { useNavigate, useParams } from 'react-router-dom'; -import { Button, Header, SpaceBetween } from 'components'; +import { Button, Container, Header, Loader, SpaceBetween } from 'components'; import { useBreadcrumbs } from 'hooks'; import { ROUTES } from 'routes'; +import { useGetProjectQuery } from 'services/project'; import { EventList } from 'pages/Events/List'; @@ -14,6 +15,7 @@ export const Events: React.FC = () => { const params = useParams(); const paramProjectName = params.projectName ?? ''; const navigate = useNavigate(); + const { data, isLoading } = useGetProjectQuery({ name: paramProjectName }); useBreadcrumbs([ { @@ -31,9 +33,16 @@ export const Events: React.FC = () => { ]); const goToEventsPage = () => { - navigate(ROUTES.EVENTS.LIST + `?within_projects=${paramProjectName}`); + navigate(ROUTES.EVENTS.LIST + `?within_projects=${data?.project_id}`); }; + if (isLoading || !data) + return ( + + + + ); + return ( { @@ -48,7 +57,7 @@ export const Events: React.FC = () => { /> ); }} - permanentFilters={{ within_projects: [paramProjectName] }} + permanentFilters={{ within_projects: [data.project_id] }} showFilters={false} /> ); diff --git a/frontend/src/pages/Runs/List/hooks/useFilters.ts b/frontend/src/pages/Runs/List/hooks/useFilters.ts index 82f1ca40bd..c1af161607 100644 --- a/frontend/src/pages/Runs/List/hooks/useFilters.ts +++ b/frontend/src/pages/Runs/List/hooks/useFilters.ts @@ -4,13 +4,10 @@ import { ToggleProps } from '@cloudscape-design/components'; import type { PropertyFilterProps } from 'components'; -import { useProjectFilter } from 
'hooks/useProjectFilter'; +import { useLocalStorageState } from 'hooks'; import { EMPTY_QUERY, requestParamsToTokens, tokensToRequestParams, tokensToSearchParams } from 'libs/filters'; -import { useGetUserListQuery } from 'services/user'; - -type Args = { - localStorePrefix: string; -}; +import { useLazyGetProjectsQuery } from 'services/project'; +import { useLazyGetUserListQuery } from 'services/user'; type RequestParamsKeys = keyof Pick; @@ -19,11 +16,15 @@ const filterKeys: Record = { USER_NAME: 'username', }; -export const useFilters = ({ localStorePrefix }: Args) => { +const limit = 100; + +export const useFilters = () => { const [searchParams, setSearchParams] = useSearchParams(); - const [onlyActive, setOnlyActive] = useState(() => searchParams.get('only_active') === 'true'); - const { projectOptions } = useProjectFilter({ localStorePrefix }); - const { data: usersData } = useGetUserListQuery({}); + const [onlyActive, setOnlyActive] = useLocalStorageState('run-list-filter-only-active', true); + const [filteringOptions, setFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); + const [getUsers] = useLazyGetUserListQuery(); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => requestParamsToTokens({ searchParams, filterKeys }), @@ -31,44 +32,26 @@ export const useFilters = ({ localStorePrefix }: Args) => { const clearFilter = () => { setSearchParams({}); - setOnlyActive(false); setPropertyFilterQuery(EMPTY_QUERY); }; - const filteringOptions = useMemo(() => { - const options: PropertyFilterProps.FilteringOption[] = []; - - projectOptions.forEach(({ value }) => { - if (value) - options.push({ - propertyKey: filterKeys.PROJECT_NAME, - value, - }); - }); - - usersData?.data?.forEach(({ username }) => { - options.push({ - propertyKey: filterKeys.USER_NAME, - value: username, - }); - }); - - return options; - }, [projectOptions, usersData]); - - const 
filteringProperties = [ - { - key: filterKeys.PROJECT_NAME, - operators: ['='], - propertyLabel: 'Project', - groupValuesLabel: 'Project values', - }, - { - key: filterKeys.USER_NAME, - operators: ['='], - propertyLabel: 'User', - }, - ]; + const filteringProperties = useMemo( + () => [ + { + key: filterKeys.PROJECT_NAME, + operators: ['='], + propertyLabel: 'Project', + groupValuesLabel: 'Project values', + }, + { + key: filterKeys.USER_NAME, + operators: ['='], + propertyLabel: 'User', + groupValuesLabel: 'User values', + }, + ], + [], + ); const onChangePropertyFilter: PropertyFilterProps['onChange'] = ({ detail }) => { const { tokens, operation } = detail; @@ -87,8 +70,6 @@ export const useFilters = ({ localStorePrefix }: Args) => { const onChangeOnlyActive: ToggleProps['onChange'] = ({ detail }) => { setOnlyActive(detail.checked); - - setSearchParams(tokensToSearchParams(propertyFilterQuery.tokens, detail.checked)); }; const filteringRequestParams = useMemo(() => { @@ -102,6 +83,42 @@ export const useFilters = ({ localStorePrefix }: Args) => { } as Partial; }, [propertyFilterQuery, onlyActive]); + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setFilteringOptions([]); + + if (!filteringText.length) { + return Promise.resolve(); + } + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects({ name_pattern: filteringText, limit }) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setFilteringOptions); + } + + if (filteringProperty?.key === filterKeys.USER_NAME) { + await getUsers({ name_pattern: filteringText, limit }) + .unwrap() + .then(({ data }) => + data.map(({ username }) => ({ + propertyKey: filterKeys.USER_NAME, + value: username, + })), + ) + .then(setFilteringOptions); + } + + setFilteringStatusType(undefined); + }; 
+ return { filteringRequestParams, clearFilter, @@ -111,5 +128,7 @@ export const useFilters = ({ localStorePrefix }: Args) => { filteringProperties, onlyActive, onChangeOnlyActive, + filteringStatusType, + handleLoadItems, } as const; }; diff --git a/frontend/src/pages/Runs/List/index.tsx b/frontend/src/pages/Runs/List/index.tsx index 4cd69ffd41..ad4c63ef94 100644 --- a/frontend/src/pages/Runs/List/index.tsx +++ b/frontend/src/pages/Runs/List/index.tsx @@ -47,9 +47,9 @@ export const RunList: React.FC = () => { filteringRequestParams, onlyActive, onChangeOnlyActive, - } = useFilters({ - localStorePrefix: 'administration-run-list-page', - }); + filteringStatusType, + handleLoadItems, + } = useFilters(); const projectHavingFleetMap = useCheckingForFleetsInProjects({}); @@ -188,9 +188,12 @@ export const RunList: React.FC = () => { filteringAriaLabel: t('projects.run.filter_property_placeholder'), filteringPlaceholder: t('projects.run.filter_property_placeholder'), operationAndText: 'and', + enteredTextLabel: (value) => `Use: ${value}`, }} filteringOptions={filteringOptions} filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} />
    diff --git a/frontend/src/pages/User/Details/Events/index.tsx b/frontend/src/pages/User/Details/Events/index.tsx index 3141d6f33a..be5c174208 100644 --- a/frontend/src/pages/User/Details/Events/index.tsx +++ b/frontend/src/pages/User/Details/Events/index.tsx @@ -2,10 +2,11 @@ import React, { useState } from 'react'; import { useTranslation } from 'react-i18next'; import { useNavigate, useParams } from 'react-router-dom'; -import { Button, Header, SegmentedControl, SpaceBetween } from 'components'; +import { Button, Container, Header, Loader, SegmentedControl, SpaceBetween } from 'components'; import { useBreadcrumbs } from 'hooks'; import { ROUTES } from 'routes'; +import { useGetUserQuery } from 'services/user'; import { EventList } from 'pages/Events/List'; @@ -15,6 +16,7 @@ export const Events: React.FC = () => { const paramUserName = params.userName ?? ''; const navigate = useNavigate(); const [filterParamName, setFilterParamName] = useState('actors'); + const { data, isLoading } = useGetUserQuery({ name: paramUserName }); useBreadcrumbs([ { @@ -32,9 +34,16 @@ export const Events: React.FC = () => { ]); const goToEventsPage = () => { - navigate(ROUTES.EVENTS.LIST + `?${filterParamName}=${paramUserName}`); + navigate(ROUTES.EVENTS.LIST + `?${filterParamName}=${data?.id}`); }; + if (isLoading || !data) + return ( + + + + ); + return ( { @@ -57,7 +66,7 @@ export const Events: React.FC = () => { /> ); }} - permanentFilters={{ [filterParamName]: [paramUserName] }} + permanentFilters={{ [filterParamName]: [data.id] }} showFilters={false} /> ); diff --git a/frontend/src/pages/Volumes/List/hooks.tsx b/frontend/src/pages/Volumes/List/hooks.tsx index ce73b3a94b..a969cfc878 100644 --- a/frontend/src/pages/Volumes/List/hooks.tsx +++ b/frontend/src/pages/Volumes/List/hooks.tsx @@ -8,12 +8,12 @@ import type { PropertyFilterProps } from 'components'; import { Button, ListEmptyMessage, NavigateLink, StatusIndicator } from 'components'; import { DATE_TIME_FORMAT } from 
'consts'; -import { useNotifications } from 'hooks'; -import { useProjectFilter } from 'hooks/useProjectFilter'; +import { useLocalStorageState, useNotifications } from 'hooks'; import { getServerError } from 'libs'; import { EMPTY_QUERY, requestParamsToTokens, tokensToRequestParams, tokensToSearchParams } from 'libs/filters'; import { getStatusIconType } from 'libs/volumes'; import { ROUTES } from 'routes'; +import { useLazyGetProjectsQuery } from 'services/project'; import { useDeleteVolumesMutation } from 'services/volume'; export const useVolumesTableEmptyMessages = ({ @@ -122,10 +122,14 @@ const filterKeys: Record = { PROJECT_NAME: 'project_name', }; -export const useFilters = (localStorePrefix = 'volume-list-page') => { +const limit = 100; + +export const useFilters = () => { const [searchParams, setSearchParams] = useSearchParams(); - const [onlyActive, setOnlyActive] = useState(() => searchParams.get('only_active') === 'true'); - const { projectOptions } = useProjectFilter({ localStorePrefix }); + const [onlyActive, setOnlyActive] = useLocalStorageState('volume-list-filter-only-active', true); + const [dynamicFilteringOptions, setDynamicFilteringOptions] = useState([]); + const [filteringStatusType, setFilteringStatusType] = useState(); + const [getProjects] = useLazyGetProjectsQuery(); const [propertyFilterQuery, setPropertyFilterQuery] = useState(() => requestParamsToTokens({ searchParams, filterKeys }), @@ -133,25 +137,14 @@ export const useFilters = (localStorePrefix = 'volume-list-page') => { const clearFilter = () => { setSearchParams({}); - setOnlyActive(false); setPropertyFilterQuery(EMPTY_QUERY); }; const isDisabledClearFilter = !propertyFilterQuery.tokens.length && !onlyActive; - const filteringOptions = useMemo(() => { - const options: PropertyFilterProps.FilteringOption[] = []; - - projectOptions.forEach(({ value }) => { - if (value) - options.push({ - propertyKey: filterKeys.PROJECT_NAME, - value, - }); - }); - - return options; - }, 
[projectOptions]); + const filteringOptions = useMemo(() => { + return [...dynamicFilteringOptions]; + }, [dynamicFilteringOptions]); const filteringProperties = [ { @@ -179,8 +172,6 @@ export const useFilters = (localStorePrefix = 'volume-list-page') => { const onChangeOnlyActive: ToggleProps['onChange'] = ({ detail }) => { setOnlyActive(detail.checked); - - setSearchParams(tokensToSearchParams(propertyFilterQuery.tokens, detail.checked)); }; const filteringRequestParams = useMemo(() => { @@ -194,6 +185,30 @@ export const useFilters = (localStorePrefix = 'volume-list-page') => { } as Partial; }, [propertyFilterQuery, onlyActive]); + const handleLoadItems: PropertyFilterProps['onLoadItems'] = async ({ detail: { filteringProperty, filteringText } }) => { + setDynamicFilteringOptions([]); + + if (!filteringText.length) { + return Promise.resolve(); + } + + setFilteringStatusType('loading'); + + if (filteringProperty?.key === filterKeys.PROJECT_NAME) { + await getProjects({ name_pattern: filteringText, limit }) + .unwrap() + .then(({ data }) => + data.map(({ project_name }) => ({ + propertyKey: filterKeys.PROJECT_NAME, + value: project_name, + })), + ) + .then(setDynamicFilteringOptions); + } + + setFilteringStatusType(undefined); + }; + return { filteringRequestParams, clearFilter, @@ -204,6 +219,8 @@ export const useFilters = (localStorePrefix = 'volume-list-page') => { onlyActive, onChangeOnlyActive, isDisabledClearFilter, + filteringStatusType, + handleLoadItems, } as const; }; diff --git a/frontend/src/pages/Volumes/List/index.tsx b/frontend/src/pages/Volumes/List/index.tsx index 15229d772e..d51b9f1d5e 100644 --- a/frontend/src/pages/Volumes/List/index.tsx +++ b/frontend/src/pages/Volumes/List/index.tsx @@ -24,6 +24,8 @@ export const VolumeList: React.FC = () => { onlyActive, onChangeOnlyActive, isDisabledClearFilter, + filteringStatusType, + handleLoadItems, } = useFilters(); const { isDeleting, deleteVolumes } = useVolumesDelete(); @@ -125,9 +127,12 @@ export 
const VolumeList: React.FC = () => { filteringAriaLabel: t('projects.run.filter_property_placeholder'), filteringPlaceholder: t('projects.run.filter_property_placeholder'), operationAndText: 'and', + enteredTextLabel: (value) => `Use: ${value}`, }} filteringOptions={filteringOptions} filteringProperties={filteringProperties} + filteringStatusType={filteringStatusType} + onLoadItems={handleLoadItems} /> From bfe44d3220931d6decaa541efecb68d4a7ad8bcf Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 5 Mar 2026 08:48:35 +0000 Subject: [PATCH 184/187] Fleet sharing main mechanisms (#3629) - DB schema for resource exports and imports - Submitting jobs to imported fleets - Viewing imported fleets and instances in API, CLI, UI - Filtering events by imported fleets and instances Currently testable through unit tests and through exports and imports manually created in the DB. --- frontend/src/pages/Fleets/List/hooks.tsx | 1 + .../pages/Instances/List/hooks/useFilters.ts | 1 + frontend/src/types/fleet.d.ts | 1 + frontend/src/types/instance.d.ts | 1 + src/dstack/_internal/cli/commands/fleet.py | 4 +- .../03_04_2221_5e8c7a9202bc_add_exports.py | 118 ++++ src/dstack/_internal/server/models.py | 60 ++ src/dstack/_internal/server/routers/fleets.py | 27 +- .../_internal/server/routers/instances.py | 15 +- src/dstack/_internal/server/schemas/fleets.py | 15 + .../_internal/server/schemas/instances.py | 1 + .../_internal/server/security/permissions.py | 68 +- .../_internal/server/services/events.py | 18 +- .../_internal/server/services/fleets.py | 77 ++- .../_internal/server/services/instances.py | 23 +- .../server/services/jobs/__init__.py | 5 +- .../_internal/server/services/runs/plan.py | 30 +- src/dstack/_internal/server/testing/common.py | 21 + src/dstack/_internal/utils/common.py | 18 +- src/dstack/api/server/_fleets.py | 6 +- .../scheduled_tasks/test_submitted_jobs.py | 105 +++ .../_internal/server/routers/test_events.py | 228 +++++++ 
.../_internal/server/routers/test_fleets.py | 624 ++++++++++++++++++ .../server/routers/test_instances.py | 326 +++++++++ .../_internal/server/routers/test_runs.py | 52 ++ 25 files changed, 1798 insertions(+), 47 deletions(-) create mode 100644 src/dstack/_internal/server/migrations/versions/2026/03_04_2221_5e8c7a9202bc_add_exports.py diff --git a/frontend/src/pages/Fleets/List/hooks.tsx b/frontend/src/pages/Fleets/List/hooks.tsx index bfff69d0ec..a725068e20 100644 --- a/frontend/src/pages/Fleets/List/hooks.tsx +++ b/frontend/src/pages/Fleets/List/hooks.tsx @@ -180,6 +180,7 @@ export const useFilters = () => { return { ...params, only_active: onlyActive, + include_imported: true, } as Partial; }, [propertyFilterQuery, onlyActive]); diff --git a/frontend/src/pages/Instances/List/hooks/useFilters.ts b/frontend/src/pages/Instances/List/hooks/useFilters.ts index bb3a5286bc..c7dfcecf5d 100644 --- a/frontend/src/pages/Instances/List/hooks/useFilters.ts +++ b/frontend/src/pages/Instances/List/hooks/useFilters.ts @@ -76,6 +76,7 @@ export const useFilters = () => { return { ...params, only_active: onlyActive, + include_imported: true, } as Partial; }, [propertyFilterQuery, onlyActive]); diff --git a/frontend/src/types/fleet.d.ts b/frontend/src/types/fleet.d.ts index 2813cd4023..b5050167b3 100644 --- a/frontend/src/types/fleet.d.ts +++ b/frontend/src/types/fleet.d.ts @@ -3,6 +3,7 @@ declare type TSpotPolicy = 'spot' | 'on-demand' | 'auto'; declare type TFleetListRequestParams = TBaseRequestListParams & { project_name?: string; only_active?: boolean; + include_imported?: boolean; }; declare interface ISSHHostParamsRequest { diff --git a/frontend/src/types/instance.d.ts b/frontend/src/types/instance.d.ts index 585f4f5093..555e355dae 100644 --- a/frontend/src/types/instance.d.ts +++ b/frontend/src/types/instance.d.ts @@ -2,6 +2,7 @@ declare type TInstanceListRequestParams = TBaseRequestListParams & { project_names?: string[]; fleet_ids?: string[]; only_active?: boolean; + 
include_imported?: boolean; }; declare type TInstanceStatus = diff --git a/src/dstack/_internal/cli/commands/fleet.py b/src/dstack/_internal/cli/commands/fleet.py index d58f50cc2a..4e9e09a3fc 100644 --- a/src/dstack/_internal/cli/commands/fleet.py +++ b/src/dstack/_internal/cli/commands/fleet.py @@ -93,7 +93,7 @@ def _command(self, args: argparse.Namespace): args.subfunc(args) def _list(self, args: argparse.Namespace): - fleets = self.api.client.fleets.list(self.api.project) + fleets = self.api.client.fleets.list(self.api.project, include_imported=True) if not args.watch: print_fleets_table(fleets, current_project=self.api.project, verbose=args.verbose) return @@ -107,7 +107,7 @@ def _list(self, args: argparse.Namespace): ) ) time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS) - fleets = self.api.client.fleets.list(self.api.project) + fleets = self.api.client.fleets.list(self.api.project, include_imported=True) except KeyboardInterrupt: pass diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_04_2221_5e8c7a9202bc_add_exports.py b/src/dstack/_internal/server/migrations/versions/2026/03_04_2221_5e8c7a9202bc_add_exports.py new file mode 100644 index 0000000000..05a022f7ff --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_04_2221_5e8c7a9202bc_add_exports.py @@ -0,0 +1,118 @@ +"""Add exports + +Revision ID: 5e8c7a9202bc +Revises: 46150101edec +Create Date: 2026-03-04 22:21:54.971260+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "5e8c7a9202bc" +down_revision = "46150101edec" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "exports", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("name", sa.String(length=100), nullable=False), + sa.Column( + "project_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_exports_project_id_projects"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_exports")), + sa.UniqueConstraint("project_id", "name", name="uq_exports_project_id_name"), + ) + with op.batch_alter_table("exports", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_exports_project_id"), ["project_id"], unique=False) + + op.create_table( + "exported_fleets", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("export_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("fleet_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.ForeignKeyConstraint( + ["export_id"], + ["exports.id"], + name=op.f("fk_exported_fleets_export_id_exports"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["fleet_id"], + ["fleets.id"], + name=op.f("fk_exported_fleets_fleet_id_fleets"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_exported_fleets")), + sa.UniqueConstraint("export_id", "fleet_id", name="uq_exported_fleets_export_id_fleet_id"), + ) + with op.batch_alter_table("exported_fleets", schema=None) as batch_op: + batch_op.create_index( + batch_op.f("ix_exported_fleets_export_id"), ["export_id"], unique=False + ) + batch_op.create_index( + batch_op.f("ix_exported_fleets_fleet_id"), ["fleet_id"], unique=False + ) + + op.create_table( + "imports", + sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column( + "project_id", 
sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False + ), + sa.Column("export_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False), + sa.Column("created_at", dstack._internal.server.models.NaiveDateTime(), nullable=False), + sa.ForeignKeyConstraint( + ["export_id"], + ["exports.id"], + name=op.f("fk_imports_export_id_exports"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_imports_project_id_projects"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_imports")), + sa.UniqueConstraint("project_id", "export_id", name="uq_imports_project_id_export_id"), + ) + with op.batch_alter_table("imports", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_imports_export_id"), ["export_id"], unique=False) + batch_op.create_index(batch_op.f("ix_imports_project_id"), ["project_id"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("imports", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_imports_project_id")) + batch_op.drop_index(batch_op.f("ix_imports_export_id")) + + op.drop_table("imports") + with op.batch_alter_table("exported_fleets", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_exported_fleets_fleet_id")) + batch_op.drop_index(batch_op.f("ix_exported_fleets_export_id")) + + op.drop_table("exported_fleets") + with op.batch_alter_table("exports", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_exports_project_id")) + + op.drop_table("exports") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 15c5488da5..15801a25df 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -978,3 +978,63 @@ class EventTargetModel(BaseModel): ) entity_id: Mapped[uuid.UUID] = mapped_column(UUIDType(binary=False), index=True) entity_name: Mapped[str] = mapped_column(String(200)) + + +class ExportModel(BaseModel): + __tablename__ = "exports" + __table_args__ = (UniqueConstraint("project_id", "name", name="uq_exports_project_id_name"),) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + name: Mapped[str] = mapped_column(String(100)) + project_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("projects.id", ondelete="CASCADE"), index=True + ) + project: Mapped["ProjectModel"] = relationship() + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + imports: Mapped[List["ImportModel"]] = relationship(back_populates="export") + exported_fleets: Mapped[List["ExportedFleetModel"]] = relationship(back_populates="export") + + +class ImportModel(BaseModel): + __tablename__ = "imports" + __table_args__ = ( + UniqueConstraint( + "project_id", + "export_id", + name="uq_imports_project_id_export_id", + ), + ) + + id: 
Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + project_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("projects.id", ondelete="CASCADE"), index=True + ) + project: Mapped["ProjectModel"] = relationship() + export_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("exports.id", ondelete="CASCADE"), index=True + ) + export: Mapped["ExportModel"] = relationship() + created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) + + +class ExportedFleetModel(BaseModel): + __tablename__ = "exported_fleets" + __table_args__ = ( + UniqueConstraint("export_id", "fleet_id", name="uq_exported_fleets_export_id_fleet_id"), + ) + + id: Mapped[uuid.UUID] = mapped_column( + UUIDType(binary=False), primary_key=True, default=uuid.uuid4 + ) + export_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("exports.id", ondelete="CASCADE"), index=True + ) + export: Mapped["ExportModel"] = relationship() + fleet_id: Mapped[uuid.UUID] = mapped_column( + ForeignKey("fleets.id", ondelete="CASCADE"), index=True + ) + fleet: Mapped["FleetModel"] = relationship() diff --git a/src/dstack/_internal/server/routers/fleets.py b/src/dstack/_internal/server/routers/fleets.py index a436d1123a..cb18db8bbd 100644 --- a/src/dstack/_internal/server/routers/fleets.py +++ b/src/dstack/_internal/server/routers/fleets.py @@ -9,6 +9,7 @@ from dstack._internal.core.models.fleets import Fleet, FleetPlan from dstack._internal.server.compatibility.common import patch_offers_list from dstack._internal.server.db import get_session +from dstack._internal.server.deps import Project from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.fleets import ( ApplyFleetPlanRequest, @@ -18,8 +19,13 @@ GetFleetPlanRequest, GetFleetRequest, ListFleetsRequest, + ListProjectFleetsRequest, +) +from dstack._internal.server.security.permissions import ( + Authenticated, + ProjectMember, + 
check_can_access_fleet, ) -from dstack._internal.server.security.permissions import Authenticated, ProjectMember from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, @@ -58,6 +64,7 @@ async def list_fleets( user=user, project_name=body.project_name, only_active=body.only_active, + include_imported=body.include_imported, prev_created_at=body.prev_created_at, prev_id=body.prev_id, limit=body.limit, @@ -68,6 +75,7 @@ async def list_fleets( @project_router.post("/list", response_model=List[Fleet]) async def list_project_fleets( + body: Optional[ListProjectFleetsRequest] = None, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), ): @@ -76,8 +84,14 @@ async def list_project_fleets( Includes only active fleet instances. To list all fleet instances, use `/api/instances/list`. """ _, project = user_project + if body is None: + body = ListProjectFleetsRequest() return CustomORJSONResponse( - await fleets_services.list_project_fleets(session=session, project=project) + await fleets_services.list_project_fleets( + session=session, + project=project, + include_imported=body.include_imported, + ) ) @@ -85,16 +99,19 @@ async def list_project_fleets( async def get_fleet( body: GetFleetRequest, session: AsyncSession = Depends(get_session), - user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + user: UserModel = Depends(Authenticated()), + project: ProjectModel = Depends(Project()), ): """ Returns a fleet given `name` or `id`. If given `name`, does not return deleted fleets. If given `id`, returns deleted fleets. 
""" - _, project = user_project + await check_can_access_fleet( + session=session, user=user, fleet_project=project, fleet_name_or_id=body.get_name_or_id() + ) fleet = await fleets_services.get_fleet( - session=session, project=project, name=body.name, fleet_id=body.id + session=session, project=project, name_or_id=body.get_name_or_id() ) if fleet is None: raise ResourceNotExistsError() diff --git a/src/dstack/_internal/server/routers/instances.py b/src/dstack/_internal/server/routers/instances.py index b241d7e764..1b64ec8b82 100644 --- a/src/dstack/_internal/server/routers/instances.py +++ b/src/dstack/_internal/server/routers/instances.py @@ -7,6 +7,7 @@ from dstack._internal.core.errors import ResourceNotExistsError from dstack._internal.core.models.instances import Instance from dstack._internal.server.db import get_session +from dstack._internal.server.deps import Project from dstack._internal.server.models import ProjectModel, UserModel from dstack._internal.server.schemas.instances import ( GetInstanceHealthChecksRequest, @@ -14,7 +15,11 @@ GetInstanceRequest, ListInstancesRequest, ) -from dstack._internal.server.security.permissions import Authenticated, ProjectMember +from dstack._internal.server.security.permissions import ( + Authenticated, + ProjectMember, + check_can_access_instance, +) from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, @@ -52,6 +57,7 @@ async def list_instances( project_names=body.project_names, fleet_ids=body.fleet_ids, only_active=body.only_active, + include_imported=body.include_imported, prev_created_at=body.prev_created_at, prev_id=body.prev_id, limit=body.limit, @@ -83,12 +89,15 @@ async def get_instance_health_checks( async def get_instance( body: GetInstanceRequest, session: Annotated[AsyncSession, Depends(get_session)], - user_project: Annotated[tuple[UserModel, ProjectModel], Depends(ProjectMember())], + user: Annotated[UserModel, Depends(Authenticated())], + project: 
Annotated[ProjectModel, Depends(Project())], ): """ Returns an instance given its ID. """ - _, project = user_project + await check_can_access_instance( + session=session, user=user, instance_project=project, instance_id=body.id + ) instance = await instances_services.get_instance( session=session, project=project, instance_id=body.id ) diff --git a/src/dstack/_internal/server/schemas/fleets.py b/src/dstack/_internal/server/schemas/fleets.py index 3df43d12ce..4bb25d50bb 100644 --- a/src/dstack/_internal/server/schemas/fleets.py +++ b/src/dstack/_internal/server/schemas/fleets.py @@ -4,23 +4,38 @@ from pydantic import Field +from dstack._internal.core.errors import ServerClientError from dstack._internal.core.models.common import CoreModel from dstack._internal.core.models.fleets import ApplyFleetPlanInput, FleetSpec +from dstack._internal.utils.common import EntityID, EntityName, EntityNameOrID class ListFleetsRequest(CoreModel): project_name: Optional[str] = None only_active: bool = False + include_imported: bool = False prev_created_at: Optional[datetime] = None prev_id: Optional[UUID] = None limit: int = Field(100, ge=0, le=100) ascending: bool = False +class ListProjectFleetsRequest(CoreModel): + include_imported: bool = False + + class GetFleetRequest(CoreModel): name: Optional[str] id: Optional[UUID] = None + def get_name_or_id(self) -> EntityNameOrID: + if self.id is not None: + return EntityID(id=self.id) + elif self.name is not None: + return EntityName(name=self.name) + else: + raise ServerClientError("name or id must be specified") + class GetFleetPlanRequest(CoreModel): spec: FleetSpec diff --git a/src/dstack/_internal/server/schemas/instances.py b/src/dstack/_internal/server/schemas/instances.py index 120ff161dc..8f87935b92 100644 --- a/src/dstack/_internal/server/schemas/instances.py +++ b/src/dstack/_internal/server/schemas/instances.py @@ -15,6 +15,7 @@ class ListInstancesRequest(CoreModel): project_names: Optional[list[str]] = None fleet_ids: 
Optional[list[UUID]] = None only_active: bool = False + include_imported: bool = False prev_created_at: Optional[datetime] = None prev_id: Optional[UUID] = None limit: int = 1000 diff --git a/src/dstack/_internal/server/security/permissions.py b/src/dstack/_internal/server/security/permissions.py index 0ecddf1d9e..107e526d30 100644 --- a/src/dstack/_internal/server/security/permissions.py +++ b/src/dstack/_internal/server/security/permissions.py @@ -1,13 +1,23 @@ from typing import Annotated, Optional, Tuple +from uuid import UUID from fastapi import Depends, HTTPException, Security from fastapi.security import HTTPBearer from fastapi.security.http import HTTPAuthorizationCredentials +from sqlalchemy import exists, func, select from sqlalchemy.ext.asyncio import AsyncSession from dstack._internal.core.models.users import GlobalRole, ProjectRole from dstack._internal.server.db import get_session -from dstack._internal.server.models import ProjectModel, UserModel +from dstack._internal.server.models import ( + ExportedFleetModel, + FleetModel, + ImportModel, + InstanceModel, + MemberModel, + ProjectModel, + UserModel, +) from dstack._internal.server.services.projects import ( get_project_model_by_name, get_user_project_role, @@ -18,6 +28,7 @@ error_invalid_token, error_not_found, ) +from dstack._internal.utils.common import EntityName, EntityNameOrID class Authenticated: @@ -249,3 +260,58 @@ async def is_project_member(session: AsyncSession, project_name: str, token: str return True except HTTPException: return False + + +async def check_can_access_fleet( + session: AsyncSession, + user: UserModel, + fleet_project: ProjectModel, + fleet_name_or_id: EntityNameOrID, +) -> None: + if ( + user.global_role == GlobalRole.ADMIN + or get_user_project_role(user=user, project=fleet_project) is not None + ): + return + filters = [ + FleetModel.project_id == fleet_project.id, + exists().where( + MemberModel.user_id == user.id, + MemberModel.project_id == ImportModel.project_id, 
+ ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == FleetModel.id, + ), + ] + if isinstance(fleet_name_or_id, EntityName): + filters.extend([FleetModel.name == fleet_name_or_id.name, FleetModel.deleted == False]) + else: + filters.append(FleetModel.id == fleet_name_or_id.id) + res = await session.execute(select(func.count()).select_from(FleetModel).where(*filters)) + if res.scalar_one() == 0: + raise error_forbidden() + + +async def check_can_access_instance( + session: AsyncSession, + user: UserModel, + instance_project: ProjectModel, + instance_id: UUID, +) -> None: + if ( + user.global_role == GlobalRole.ADMIN + or get_user_project_role(user=user, project=instance_project) is not None + ): + return + filters = [ + InstanceModel.project_id == instance_project.id, + InstanceModel.id == instance_id, + exists().where( + MemberModel.user_id == user.id, + MemberModel.project_id == ImportModel.project_id, + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == InstanceModel.fleet_id, + ), + ] + res = await session.execute(select(func.count()).select_from(InstanceModel).where(*filters)) + if res.scalar_one() == 0: + raise error_forbidden() diff --git a/src/dstack/_internal/server/services/events.py b/src/dstack/_internal/server/services/events.py index d46b43e201..dd7b33dc7f 100644 --- a/src/dstack/_internal/server/services/events.py +++ b/src/dstack/_internal/server/services/events.py @@ -252,14 +252,14 @@ async def list_events( limit: int, ascending: bool, ) -> list[Event]: - target_filters = [] + target_visibility_filters = [] if user.global_role != GlobalRole.ADMIN: query = select(MemberModel.project_id).where(MemberModel.user_id == user.id) res = await session.execute(query) # In Postgres, fetching project IDs separately is orders of magnitude faster # than using a subquery. 
project_ids = list(res.unique().scalars().all()) - target_filters.append( + target_visibility_filters.append( or_( EventTargetModel.entity_project_id.in_(project_ids), and_( @@ -269,6 +269,7 @@ async def list_events( ), ) ) + target_filters = [] if target_projects is not None: target_filters.append( and_( @@ -426,6 +427,8 @@ async def list_events( if event_filters: query = query.where(*event_filters) if target_filters: + # Each returned event should reference at least one target the user **wants** to see + # (as defined by user-provided filters). query = query.where( exists().where( and_( @@ -434,6 +437,17 @@ async def list_events( ) ) ) + if target_visibility_filters: + # Each returned event should reference at least one target the user **can** see + # (as defined by project membership). + query = query.where( + exists().where( + and_( + EventTargetModel.event_id == EventModel.id, + *target_visibility_filters, + ) + ) + ) res = await session.execute(query) event_models = res.unique().scalars().all() return list(map(event_model_to_event, event_models)) diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index c0ec21aeaa..ca5a2e7b4f 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -4,7 +4,7 @@ from functools import wraps from typing import List, Literal, Optional, Tuple, TypeVar, Union -from sqlalchemy import and_, func, or_, select +from sqlalchemy import and_, exists, false, func, or_, select from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import aliased, joinedload, selectinload @@ -53,7 +53,9 @@ from dstack._internal.core.services.diff import ModelDiff, copy_model, diff_models from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite from dstack._internal.server.models import ( + ExportedFleetModel, FleetModel, + ImportModel, InstanceModel, JobModel, MemberModel, @@ -82,6 +84,7 @@ ) from 
dstack._internal.server.services.resources import set_resources_defaults from dstack._internal.utils import random_names +from dstack._internal.utils.common import EntityID, EntityName, EntityNameOrID from dstack._internal.utils.logging import get_logger from dstack._internal.utils.ssh import pkey_from_str @@ -193,6 +196,7 @@ async def list_fleets( user: UserModel, project_name: Optional[str], only_active: bool, + include_imported: bool, prev_created_at: Optional[datetime], prev_id: Optional[uuid.UUID], limit: int, @@ -209,6 +213,7 @@ async def list_fleets( session=session, projects=projects, only_active=only_active, + include_imported=include_imported, prev_created_at=prev_created_at, prev_id=prev_id, limit=limit, @@ -221,13 +226,25 @@ async def list_projects_fleet_models( session: AsyncSession, projects: List[ProjectModel], only_active: bool, + include_imported: bool, prev_created_at: Optional[datetime], prev_id: Optional[uuid.UUID], limit: int, ascending: bool, ) -> List[FleetModel]: filters = [] - filters.append(FleetModel.project_id.in_(p.id for p in projects)) + project_ids = {p.id for p in projects} + is_fleet_imported_subquery = exists().where( + ImportModel.project_id.in_(project_ids), + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == FleetModel.id, + ) + filters.append( + or_( + FleetModel.project_id.in_(project_ids), + is_fleet_imported_subquery if include_imported else false(), + ) + ) if only_active: filters.append(FleetModel.deleted == False) if prev_created_at is not None: @@ -259,7 +276,10 @@ async def list_projects_fleet_models( .where(*filters) .order_by(*order_by) .limit(limit) - .options(selectinload(FleetModel.instances.and_(InstanceModel.deleted == False))) + .options( + joinedload(FleetModel.project).load_only(ProjectModel.name), + selectinload(FleetModel.instances.and_(InstanceModel.deleted == False)), + ) ) fleet_models = list(res.unique().scalars().all()) return fleet_models @@ -269,8 +289,11 @@ 
async def list_project_fleets( session: AsyncSession, project: ProjectModel, names: Optional[List[str]] = None, + include_imported: bool = False, ) -> List[Fleet]: - fleet_models = await list_project_fleet_models(session=session, project=project, names=names) + fleet_models = await list_project_fleet_models( + session=session, project=project, names=names, include_imported=include_imported + ) return [fleet_model_to_fleet(v) for v in fleet_models] @@ -278,11 +301,21 @@ async def list_project_fleet_models( session: AsyncSession, project: ProjectModel, names: Optional[List[str]] = None, + include_imported: bool = False, include_deleted: bool = False, ) -> List[FleetModel]: - filters = [ - FleetModel.project_id == project.id, - ] + filters = [] + is_fleet_imported_subquery = exists().where( + ImportModel.project_id == project.id, + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == FleetModel.id, + ) + filters.append( + or_( + FleetModel.project_id == project.id, + is_fleet_imported_subquery if include_imported else false(), + ) + ) if names is not None: filters.append(FleetModel.name.in_(names)) if not include_deleted: @@ -290,7 +323,10 @@ async def list_project_fleet_models( res = await session.execute( select(FleetModel) .where(*filters) - .options(selectinload(FleetModel.instances.and_(InstanceModel.deleted == False))) + .options( + joinedload(FleetModel.project).load_only(ProjectModel.name), + selectinload(FleetModel.instances.and_(InstanceModel.deleted == False)), + ) ) return list(res.unique().scalars().all()) @@ -298,20 +334,17 @@ async def list_project_fleet_models( async def get_fleet( session: AsyncSession, project: ProjectModel, - name: Optional[str] = None, - fleet_id: Optional[uuid.UUID] = None, + name_or_id: EntityNameOrID, include_sensitive: bool = False, ) -> Optional[Fleet]: - if fleet_id is not None: + if isinstance(name_or_id, EntityID): fleet_model = await get_project_fleet_model_by_id( - session=session, 
project=project, fleet_id=fleet_id + session=session, project=project, fleet_id=name_or_id.id ) - elif name is not None: + else: fleet_model = await get_project_fleet_model_by_name( - session=session, project=project, name=name + session=session, project=project, name=name_or_id.name ) - else: - raise ServerClientError("name or id must be specified") if fleet_model is None: return None return fleet_model_to_fleet(fleet_model, include_sensitive=include_sensitive) @@ -329,7 +362,10 @@ async def get_project_fleet_model_by_id( res = await session.execute( select(FleetModel) .where(*filters) - .options(joinedload(FleetModel.instances.and_(InstanceModel.deleted == False))) + .options( + joinedload(FleetModel.instances.and_(InstanceModel.deleted == False)), + joinedload(FleetModel.project).load_only(ProjectModel.name), + ) ) return res.unique().scalar_one_or_none() @@ -349,7 +385,10 @@ async def get_project_fleet_model_by_name( res = await session.execute( select(FleetModel) .where(*filters) - .options(joinedload(FleetModel.instances.and_(InstanceModel.deleted == False))) + .options( + joinedload(FleetModel.instances.and_(InstanceModel.deleted == False)), + joinedload(FleetModel.project).load_only(ProjectModel.name), + ) ) return res.unique().scalar_one_or_none() @@ -379,7 +418,7 @@ async def get_plan( current_fleet = await get_fleet( session=session, project=project, - name=effective_spec.configuration.name, + name_or_id=EntityName(effective_spec.configuration.name), include_sensitive=True, ) if current_fleet is not None: diff --git a/src/dstack/_internal/server/services/instances.py b/src/dstack/_internal/server/services/instances.py index 046f092c03..079faf90c7 100644 --- a/src/dstack/_internal/server/services/instances.py +++ b/src/dstack/_internal/server/services/instances.py @@ -5,7 +5,7 @@ from typing import Dict, List, Literal, Optional, Union import gpuhunt -from sqlalchemy import and_, or_, select +from sqlalchemy import and_, exists, false, or_, select from 
sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload, load_only @@ -42,7 +42,9 @@ from dstack._internal.core.services.profiles import get_termination from dstack._internal.server import settings as server_settings from dstack._internal.server.models import ( + ExportedFleetModel, FleetModel, + ImportModel, InstanceHealthCheckModel, InstanceModel, ProjectModel, @@ -516,13 +518,23 @@ async def list_projects_instance_models( projects: List[ProjectModel], fleet_ids: Optional[Iterable[uuid.UUID]], only_active: bool, + include_imported: bool, prev_created_at: Optional[datetime], prev_id: Optional[uuid.UUID], limit: int, ascending: bool, ) -> List[InstanceModel]: + project_ids = [p.id for p in projects] + is_instance_imported_subquery = exists().where( + ImportModel.project_id.in_(project_ids), + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == InstanceModel.fleet_id, + ) filters: List = [ - InstanceModel.project_id.in_(p.id for p in projects), + or_( + InstanceModel.project_id.in_(project_ids), + is_instance_imported_subquery if include_imported else false(), + ) ] if fleet_ids is not None: filters.append(InstanceModel.fleet_id.in_(fleet_ids)) @@ -569,7 +581,10 @@ async def list_projects_instance_models( .where(*filters) .order_by(*order_by) .limit(limit) - .options(joinedload(InstanceModel.fleet)) + .options( + joinedload(InstanceModel.fleet), + joinedload(InstanceModel.project).load_only(ProjectModel.name), + ) ) instance_models = list(res.unique().scalars().all()) return instance_models @@ -581,6 +596,7 @@ async def list_user_instances( project_names: Optional[Container[str]], fleet_ids: Optional[Iterable[uuid.UUID]], only_active: bool, + include_imported: bool, prev_created_at: Optional[datetime], prev_id: Optional[uuid.UUID], limit: int, @@ -600,6 +616,7 @@ async def list_user_instances( projects=projects, fleet_ids=fleet_ids, only_active=only_active, + include_imported=include_imported, 
prev_created_at=prev_created_at, prev_id=prev_id, limit=limit, diff --git a/src/dstack/_internal/server/services/jobs/__init__.py b/src/dstack/_internal/server/services/jobs/__init__.py index eb10bda5c4..bf0f65bb6f 100644 --- a/src/dstack/_internal/server/services/jobs/__init__.py +++ b/src/dstack/_internal/server/services/jobs/__init__.py @@ -273,10 +273,7 @@ def _get_job_configurator( async def stop_runner(session: AsyncSession, job_model: JobModel): res = await session.execute( select(InstanceModel) - .where( - InstanceModel.project_id == job_model.project_id, - InstanceModel.id == job_model.instance_id, - ) + .where(InstanceModel.id == job_model.instance_id) .options(joinedload(InstanceModel.project)) ) instance: Optional[InstanceModel] = res.scalar() diff --git a/src/dstack/_internal/server/services/runs/plan.py b/src/dstack/_internal/server/services/runs/plan.py index 5e3b6e5a02..4738622a07 100644 --- a/src/dstack/_internal/server/services/runs/plan.py +++ b/src/dstack/_internal/server/services/runs/plan.py @@ -1,9 +1,9 @@ import math from typing import Optional, Union -from sqlalchemy import and_, not_, or_, select +from sqlalchemy import and_, exists, not_, or_, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import contains_eager, noload +from sqlalchemy.orm import contains_eager, joinedload, noload from dstack._internal.core.backends.base.backend import Backend from dstack._internal.core.models.fleets import Fleet, InstanceGroupPlacement @@ -21,7 +21,14 @@ RunSpec, ) from dstack._internal.core.models.volumes import Volume -from dstack._internal.server.models import FleetModel, InstanceModel, ProjectModel, RunModel +from dstack._internal.server.models import ( + ExportedFleetModel, + FleetModel, + ImportModel, + InstanceModel, + ProjectModel, + RunModel, +) from dstack._internal.server.services.fleets import ( check_can_create_new_cloud_instance_in_fleet, fleet_model_to_fleet, @@ -206,8 +213,16 @@ async def 
get_run_candidate_fleet_models_filters( # If another job freed the instance but is still trying to detach volumes, # do not provision on it to prevent attaching volumes that are currently detaching. detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session) + is_fleet_imported_subquery = exists().where( + ImportModel.project_id == project.id, + ImportModel.export_id == ExportedFleetModel.export_id, + ExportedFleetModel.fleet_id == FleetModel.id, + ) fleet_filters = [ - FleetModel.project_id == project.id, + or_( + FleetModel.project_id == project.id, + is_fleet_imported_subquery, + ), FleetModel.deleted == False, ] if run_model is not None and run_model.fleet is not None: @@ -235,7 +250,12 @@ async def select_run_candidate_fleet_models_with_filters( .join(FleetModel.instances) .where(*fleet_filters) .where(*instance_filters) - .options(contains_eager(FleetModel.instances)) + .options( + contains_eager(FleetModel.instances), + joinedload(FleetModel.project) + .load_only(ProjectModel.name) + .joinedload(ProjectModel.backends), + ) .execution_options(populate_existing=True) ) if lock_instances: diff --git a/src/dstack/_internal/server/testing/common.py b/src/dstack/_internal/server/testing/common.py index 6bff65dea3..2893418a0d 100644 --- a/src/dstack/_internal/server/testing/common.py +++ b/src/dstack/_internal/server/testing/common.py @@ -93,10 +93,13 @@ ComputeGroupModel, DecryptedString, EventModel, + ExportedFleetModel, + ExportModel, FileArchiveModel, FleetModel, GatewayComputeModel, GatewayModel, + ImportModel, InstanceHealthCheckModel, InstanceModel, JobMetricsPoint, @@ -514,6 +517,24 @@ async def create_compute_group( return compute_group +async def create_export( + session: AsyncSession, + exporter_project: ProjectModel, + importer_projects: list[ProjectModel], + exported_fleets: list[FleetModel], + name: str = "test_export", +) -> ExportModel: + export = ExportModel( + name=name, + project=exporter_project, + 
imports=[ImportModel(project=project) for project in importer_projects], + exported_fleets=[ExportedFleetModel(fleet=fleet) for fleet in exported_fleets], + ) + session.add(export) + await session.commit() + return export + + async def create_probe( session: AsyncSession, job: JobModel, diff --git a/src/dstack/_internal/utils/common.py b/src/dstack/_internal/utils/common.py index ba139c6bfc..2db91882ff 100644 --- a/src/dstack/_internal/utils/common.py +++ b/src/dstack/_internal/utils/common.py @@ -4,16 +4,32 @@ import re import time from collections.abc import Callable +from dataclasses import dataclass from datetime import datetime, timedelta, timezone from functools import partial from pathlib import Path -from typing import Any, Iterable, List, Optional, TypeVar +from typing import Any, Iterable, List, Optional, TypeVar, Union from urllib.parse import urlparse +from uuid import UUID from typing_extensions import ParamSpec from dstack._internal.core.models.common import Duration + +@dataclass +class EntityName: + name: str + + +@dataclass +class EntityID: + id: UUID + + +EntityNameOrID = Union[EntityName, EntityID] + + P = ParamSpec("P") R = TypeVar("R") diff --git a/src/dstack/api/server/_fleets.py b/src/dstack/api/server/_fleets.py index 9bfb1cb422..95bb22e82d 100644 --- a/src/dstack/api/server/_fleets.py +++ b/src/dstack/api/server/_fleets.py @@ -16,13 +16,15 @@ DeleteFleetsRequest, GetFleetPlanRequest, GetFleetRequest, + ListProjectFleetsRequest, ) from dstack.api.server._group import APIClientGroup class FleetsAPIClient(APIClientGroup): - def list(self, project_name: str) -> List[Fleet]: - resp = self._request(f"/api/project/{project_name}/fleets/list") + def list(self, project_name: str, *, include_imported: bool = False) -> List[Fleet]: + body = ListProjectFleetsRequest(include_imported=include_imported) + resp = self._request(f"/api/project/{project_name}/fleets/list", body=body.json()) return parse_obj_as(List[Fleet.__response__], resp.json()) def get( 
diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py b/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py index f33f608c71..db75bbf530 100644 --- a/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_submitted_jobs.py @@ -21,6 +21,7 @@ JobStatus, JobTerminationReason, ) +from dstack._internal.core.models.users import GlobalRole from dstack._internal.core.models.volumes import ( InstanceMountPoint, VolumeAttachmentData, @@ -41,6 +42,7 @@ from dstack._internal.server.settings import JobNetworkMode from dstack._internal.server.testing.common import ( ComputeMockSpec, + create_export, create_fleet, create_instance, create_job, @@ -55,6 +57,7 @@ get_job_provisioning_data, get_placement_group_provisioning_data, get_run_spec, + get_ssh_fleet_configuration, get_volume_provisioning_data, ) @@ -365,6 +368,108 @@ async def test_assignes_job_to_instance(self, test_db, session: AsyncSession): job.instance_assigned and job.instance is not None and job.instance.id == instance.id ) + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_assigns_job_to_imported_fleet(self, test_db, session: AsyncSession): + exporter_user = await create_user( + session, name="exporter-user", global_role=GlobalRole.USER + ) + importer_user = await create_user( + session, name="importer_user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=exporter_user + ) + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + repo = await create_repo(session=session, project_id=importer_project.id) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + instance = await create_instance( + session=session, + 
project=exporter_project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + run = await create_run( + session=session, + project=importer_project, + repo=repo, + user=importer_user, + ) + job = await create_job( + session=session, + run=run, + instance_assigned=False, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await process_submitted_jobs() + await session.refresh(job) + res = await session.execute(select(JobModel).options(joinedload(JobModel.instance))) + job = res.unique().scalar_one() + assert job.status == JobStatus.SUBMITTED + assert ( + job.instance_assigned and job.instance is not None and job.instance.id == instance.id + ) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_not_assigns_job_to_foreign_fleet_if_not_imported( + self, test_db, session: AsyncSession + ): + exporter_user = await create_user( + session, name="exporter-user", global_role=GlobalRole.USER + ) + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=exporter_user + ) + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + repo = await create_repo(session=session, project_id=importer_project.id) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + run = await create_run( + session=session, + project=importer_project, + repo=repo, + user=importer_user, + ) + job = await create_job( + session=session, + run=run, + instance_assigned=False, + ) + await process_submitted_jobs() + await session.refresh(job) + res = await 
session.execute(select(JobModel).options(joinedload(JobModel.instance))) + job = res.unique().scalar_one() + assert job.status == JobStatus.TERMINATING + assert job.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY + assert not job.instance_assigned + assert job.instance is None + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_does_no_reuse_unavailable_instances(self, test_db, session: AsyncSession): diff --git a/src/tests/_internal/server/routers/test_events.py b/src/tests/_internal/server/routers/test_events.py index f31c082d06..cb8e44b85a 100644 --- a/src/tests/_internal/server/routers/test_events.py +++ b/src/tests/_internal/server/routers/test_events.py @@ -3,6 +3,7 @@ from unittest.mock import patch import pytest +import pytest_asyncio from freezegun import freeze_time from httpx import AsyncClient from sqlalchemy.ext.asyncio import AsyncSession @@ -11,6 +12,7 @@ from dstack._internal.server.services import events from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( + create_export, create_fleet, create_instance, create_job, @@ -19,6 +21,8 @@ create_run, create_user, get_auth_headers, + get_fleet_spec, + get_ssh_fleet_configuration, ) pytestmark = [ @@ -1326,3 +1330,227 @@ async def test_limits_events_regardless_number_of_targets( ) resp.raise_for_status() assert len(resp.json()) == 2 + + +class TestListEventsWithExportedFleet: + @pytest_asyncio.fixture + async def exported_fleet_setup(self, session: AsyncSession): + # Create exporter user and project + exporter_user = await create_user( + session, name="exporter-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=exporter_user + ) + await add_project_member( + session=session, + project=exporter_project, + user=exporter_user, + project_role=ProjectRole.USER, + ) + + # Create 
first importer user and project + importer_user_1 = await create_user( + session, name="importer-user-1", global_role=GlobalRole.USER + ) + importer_project_1 = await create_project( + session, name="importer-project-1", owner=importer_user_1 + ) + await add_project_member( + session=session, + project=importer_project_1, + user=importer_user_1, + project_role=ProjectRole.USER, + ) + + # Create second importer user and project + importer_user_2 = await create_user( + session, name="importer-user-2", global_role=GlobalRole.USER + ) + importer_project_2 = await create_project( + session, name="importer-project-2", owner=importer_user_2 + ) + await add_project_member( + session=session, + project=importer_project_2, + user=importer_user_2, + project_role=ProjectRole.USER, + ) + + # Create fleet and instance + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + events.emit( + session=session, + message="Fleet created", + actor=events.UserActor.from_user(exporter_user), + targets=[events.Target.from_model(fleet)], + ) + instance = await create_instance( + session=session, project=exporter_project, fleet=fleet, name="exported-fleet-0" + ) + events.emit( + session=session, + message="Instance created", + actor=events.SystemActor(), + targets=[events.Target.from_model(instance)], + ) + + # Create export + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project_1, importer_project_2], + exported_fleets=[fleet], + ) + + # Create first importer run and job + importer_run_1 = await create_run( + session=session, + project=importer_project_1, + user=importer_user_1, + repo=await create_repo(session=session, project_id=importer_project_1.id), + run_name="importer-run-1", + ) + events.emit( + session=session, + message="Run created", + actor=events.UserActor.from_user(importer_user_1), + 
targets=[events.Target.from_model(importer_run_1)], + ) + importer_job_1 = await create_job( + session=session, + run=importer_run_1, + fleet=fleet, + instance=instance, + ) + events.emit( + session=session, + message="Job assigned to instance", + actor=events.SystemActor(), + targets=[events.Target.from_model(importer_job_1), events.Target.from_model(instance)], + ) + + # Create second importer run and job + importer_run_2 = await create_run( + session=session, + project=importer_project_2, + user=importer_user_2, + repo=await create_repo(session=session, project_id=importer_project_2.id), + run_name="importer-run-2", + ) + events.emit( + session=session, + message="Run created", + actor=events.UserActor.from_user(importer_user_2), + targets=[events.Target.from_model(importer_run_2)], + ) + importer_job_2 = await create_job( + session=session, + run=importer_run_2, + fleet=fleet, + instance=instance, + ) + events.emit( + session=session, + message="Job assigned to instance", + actor=events.SystemActor(), + targets=[events.Target.from_model(importer_job_2), events.Target.from_model(instance)], + ) + + await session.commit() + + return { + "exporter_user": exporter_user, + "importer_user_1": importer_user_1, + "importer_user_2": importer_user_2, + "exported_fleet": fleet, + } + + @pytest.mark.parametrize("with_filter", [True, False]) + async def test_exporter_user_sees_all_events_targeting_exported_fleet( + self, + session: AsyncSession, + client: AsyncClient, + exported_fleet_setup: dict, + with_filter: bool, + ) -> None: + filters = {} + if with_filter: + filters = {"within_fleets": [str(exported_fleet_setup["exported_fleet"].id)]} + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(exported_fleet_setup["exporter_user"].token), + json={"ascending": True, **filters}, + ) + resp.raise_for_status() + assert resp.json()[0]["message"] == "Fleet created" + assert resp.json()[1]["message"] == "Instance created" + assert resp.json()[2]["message"] 
== "Job assigned to instance" + assert {t["name"] for t in resp.json()[2]["targets"]} == { + "exported-fleet-0", + "importer-run-1-0-0", + } + assert resp.json()[3]["message"] == "Job assigned to instance" + assert {t["name"] for t in resp.json()[3]["targets"]} == { + "exported-fleet-0", + "importer-run-2-0-0", + } + assert len(resp.json()) == 4 + + @pytest.mark.parametrize( + ("user_key", "job_name"), + [ + ("importer_user_1", "importer-run-1-0-0"), + ("importer_user_2", "importer-run-2-0-0"), + ], + ) + async def test_importer_user_sees_only_events_about_their_own_run( + self, + session: AsyncSession, + client: AsyncClient, + exported_fleet_setup: dict, + user_key: str, + job_name: str, + ) -> None: + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(exported_fleet_setup[user_key].token), + json={"ascending": True}, + ) + resp.raise_for_status() + assert resp.json()[0]["message"] == "Run created" + assert resp.json()[1]["message"] == "Job assigned to instance" + assert {t["name"] for t in resp.json()[1]["targets"]} == {"exported-fleet-0", job_name} + assert len(resp.json()) == 2 + + @pytest.mark.parametrize( + ("user_key", "job_name"), + [ + ("importer_user_1", "importer-run-1-0-0"), + ("importer_user_2", "importer-run-2-0-0"), + ], + ) + async def test_importer_user_can_filter_by_imported_fleet( + self, + session: AsyncSession, + client: AsyncClient, + exported_fleet_setup: dict, + user_key: str, + job_name: str, + ) -> None: + resp = await client.post( + "/api/events/list", + headers=get_auth_headers(exported_fleet_setup[user_key].token), + json={ + "ascending": True, + "within_fleets": [str(exported_fleet_setup["exported_fleet"].id)], + }, + ) + resp.raise_for_status() + assert resp.json()[0]["message"] == "Job assigned to instance" + assert {t["name"] for t in resp.json()[0]["targets"]} == {"exported-fleet-0", job_name} + assert len(resp.json()) == 1 diff --git a/src/tests/_internal/server/routers/test_fleets.py 
b/src/tests/_internal/server/routers/test_fleets.py index fed647d2c8..12108eed31 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -31,6 +31,7 @@ from dstack._internal.server.services.permissions import DefaultPermissions from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( + create_export, create_fleet, create_instance, create_job, @@ -141,6 +142,223 @@ async def test_non_admin_cannot_see_others_projects( assert len(response_json) == 1 assert response_json[0]["project_name"] == "project1" + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize("with_project_name_filter", [True, False]) + async def test_returns_imported_fleet_with_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient, with_project_name_filter: bool + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + response = await client.post( + "/api/fleets/list", + 
headers=get_auth_headers(importer_user.token), + json={ + "include_imported": True, + "project_name": "importer-project" if with_project_name_filter else None, + }, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda f: f["name"]) + assert len(response_json) == 2 + assert response_json[0]["name"] == "exported-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert len(response_json[0]["instances"]) == 1 + assert response_json[0]["instances"][0]["id"] == str(instance.id) + assert response_json[1]["name"] == "local-fleet" + assert response_json[1]["project_name"] == "importer-project" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_not_returns_imported_fleet_without_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(importer_user.token), + json={}, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + 
assert response_json[0]["name"] == "local-fleet" + assert response_json[0]["project_name"] == "importer-project" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_imported_fleet_once_when_user_member_of_both_projects( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, name="user", global_role=GlobalRole.USER) + exporter_project = await create_project(session, name="exporter-project", owner=user) + importer_project = await create_project(session, name="importer-project", owner=user) + await add_project_member( + session=session, + project=exporter_project, + user=user, + project_role=ProjectRole.USER, + ) + await add_project_member( + session=session, + project=importer_project, + user=user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="shared-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-exporter-fleet")), + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-importer-fleet")), + ) + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda f: f["name"]) + assert len(response_json) == 3 + assert response_json[0]["name"] == "local-exporter-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert 
response_json[1]["name"] == "local-importer-fleet" + assert response_json[1]["project_name"] == "importer-project" + assert response_json[2]["name"] == "shared-fleet" + assert response_json[2]["project_name"] == "exporter-project" + assert len(response_json[2]["instances"]) == 1 + assert response_json[2]["instances"][0]["id"] == str(instance.id) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_fleet_once_if_imported_twice( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + for name in ["export-1", "export-2"]: + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + name=name, + ) + response = await client.post( + "/api/fleets/list", + headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "exported-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert len(response_json[0]["instances"]) == 1 + assert response_json[0]["instances"][0]["id"] == str(instance.id) + class TestListProjectFleets: @pytest.mark.asyncio @@ -182,6 +400,155 
@@ async def test_lists_fleets(self, test_db, session: AsyncSession, client: AsyncC } ] + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_imported_fleet_with_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + response = await client.post( + f"/api/project/{importer_project.name}/fleets/list", + headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda f: f["name"]) + assert len(response_json) == 2 + assert response_json[0]["name"] == "exported-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert len(response_json[0]["instances"]) == 1 + assert response_json[0]["instances"][0]["id"] == str(instance.id) + assert response_json[1]["name"] == "local-fleet" + assert response_json[1]["project_name"] == "importer-project" + + @pytest.mark.asyncio + 
@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_not_returns_imported_fleet_without_include_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + response = await client.post( + f"/api/project/{importer_project.name}/fleets/list", + headers=get_auth_headers(importer_user.token), + json={}, # No include_imported parameter + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "local-fleet" + assert response_json[0]["project_name"] == "importer-project" + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_fleet_once_if_imported_twice( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await 
add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + for name in ["export-1", "export-2"]: + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + name=name, + ) + response = await client.post( + f"/api/project/{importer_project.name}/fleets/list", + headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "exported-fleet" + assert response_json[0]["project_name"] == "exporter-project" + assert len(response_json[0]["instances"]) == 1 + assert response_json[0]["instances"][0]["id"] == str(instance.id) + class TestGetFleet: @pytest.mark.asyncio @@ -371,6 +738,115 @@ async def test_returns_foreign_fleet_to_global_admin( assert response.status_code == 200 assert response.json()["name"] == "test-fleet" + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(False, id="by-id")] + ) + async def test_returns_imported_fleet( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + 
user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + if by_id: + body = {"id": str(fleet.id)} + else: + body = {"name": "exported-fleet"} + response = await client.post( + "/api/project/exporter-project/fleets/get", + headers=get_auth_headers(importer_user.token), + json=body, + ) + assert response.status_code == 200 + assert response.json()["id"] == str(fleet.id) + assert response.json()["name"] == "exported-fleet" + assert response.json()["project_name"] == "exporter-project" + assert len(response.json()["instances"]) == 1 + assert response.json()["instances"][0]["id"] == str(instance.id) + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + @pytest.mark.parametrize( + "by_id", [pytest.param(False, id="by-name"), pytest.param(False, id="by-id")] + ) + async def test_returns_403_on_foreign_fleet_if_not_imported( + self, test_db, session: AsyncSession, client: AsyncClient, by_id: bool + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + not_importer_user = await create_user( + session, name="not-importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=importer_user + ) + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + not_importer_project = await create_project( + session, name="not-importer-project", owner=not_importer_user + ) + await add_project_member( + session=session, + project=not_importer_project, + 
user=not_importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + if by_id: + body = {"id": str(fleet.id)} + else: + body = {"name": "exported-fleet"} + response = await client.post( + "/api/project/exporter-project/fleets/get", + headers=get_auth_headers(not_importer_user.token), + json=body, + ) + assert response.status_code == 403 + class TestApplyFleetPlan: @pytest.mark.asyncio @@ -918,6 +1394,43 @@ async def test_forbids_if_no_permission_to_manage_ssh_fleets( ) assert response.status_code in [401, 403] + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_apply_plan_on_imported_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + spec = get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=spec, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/fleets/apply", + 
headers=get_auth_headers(importer_user.token), + json={"plan": {"spec": spec.dict()}, "force": False}, + ) + assert response.status_code == 403 + class TestDeleteFleets: @pytest.mark.asyncio @@ -1062,6 +1575,42 @@ async def test_forbids_if_no_permission_to_manage_ssh_fleets( ) assert response.status_code in [401, 403] + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_delete_imported_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/fleets/delete", + headers=get_auth_headers(importer_user.token), + json={"names": [fleet.name]}, + ) + assert response.status_code == 403 + class TestDeleteFleetInstances: @pytest.mark.asyncio @@ -1188,6 +1737,48 @@ async def test_returns_400_when_fleet_locked( assert fleet.status != FleetStatus.TERMINATING assert instance.status != InstanceStatus.TERMINATING + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_delete_imported_fleet_instances( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, 
name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + instance_num=1, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/fleets/delete_instances", + headers=get_auth_headers(importer_user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 403 + class TestGetPlan: @pytest.mark.asyncio @@ -1384,6 +1975,39 @@ async def test_replaces_no_balance_with_not_available_for_old_clients( assert offers[0]["availability"] == InstanceAvailability.AVAILABLE.value assert offers[1]["availability"] == expected_availability.value + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_importer_member_cannot_get_plan_for_imported_fleet( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.ADMIN, + ) + spec = 
get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")) + fleet = await create_fleet(session=session, project=exporter_project, spec=spec) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + f"/api/project/{exporter_project.name}/fleets/get_plan", + headers=get_auth_headers(importer_user.token), + json={"spec": spec.dict()}, + ) + assert response.status_code == 403 + def _fleet_model_to_json_dict(fleet: FleetModel) -> dict: return json.loads(fleet_model_to_fleet(fleet).json()) diff --git a/src/tests/_internal/server/routers/test_instances.py b/src/tests/_internal/server/routers/test_instances.py index 45363bfd92..439538c14c 100644 --- a/src/tests/_internal/server/routers/test_instances.py +++ b/src/tests/_internal/server/routers/test_instances.py @@ -14,6 +14,7 @@ from dstack._internal.server.models import UserModel from dstack._internal.server.services.projects import add_project_member from dstack._internal.server.testing.common import ( + create_export, create_fleet, create_instance, create_instance_health_check, @@ -22,6 +23,7 @@ get_auth_headers, get_fleet_configuration, get_fleet_spec, + get_ssh_fleet_configuration, ) @@ -268,6 +270,240 @@ async def test_not_authenticated(self, client: AsyncClient, data) -> None: resp = await client.post("/api/instances/list", json={}) assert resp.status_code in [401, 403] + @pytest.mark.parametrize("with_project_name_filter", [True, False]) + async def test_returns_imported_instances_with_include_imported( + self, session: AsyncSession, client: AsyncClient, with_project_name_filter: bool + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await 
add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, project=exporter_project, fleet=fleet, name="exported-fleet-0" + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + local_fleet = await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + await create_instance( + session=session, project=importer_project, fleet=local_fleet, name="local-fleet-0" + ) + response = await client.post( + "/api/instances/list", + headers=get_auth_headers(importer_user.token), + json={ + "include_imported": True, + "project_names": ["importer-project"] if with_project_name_filter else None, + }, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda i: i["name"]) + assert len(response_json) == 2 + assert response_json[0]["name"] == "exported-fleet-0" + assert response_json[0]["project_name"] == "exporter-project" + assert response_json[0]["fleet_name"] == "exported-fleet" + assert response_json[1]["name"] == "local-fleet-0" + assert response_json[1]["project_name"] == "importer-project" + assert response_json[1]["fleet_name"] == "local-fleet" + + async def test_not_returns_imported_instances_without_include_imported( + self, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + 
session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + name="exported-fleet-0", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + local_fleet = await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-fleet")), + ) + await create_instance( + session=session, project=importer_project, fleet=local_fleet, name="local-fleet-0" + ) + response = await client.post( + "/api/instances/list", + headers=get_auth_headers(importer_user.token), + json={}, # No include_imported + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "local-fleet-0" + assert response_json[0]["project_name"] == "importer-project" + assert response_json[0]["fleet_name"] == "local-fleet" + + async def test_returns_imported_instances_once_when_user_member_of_both_projects( + self, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, name="user", global_role=GlobalRole.USER) + exporter_project = await create_project(session, name="exporter-project", owner=user) + importer_project = await create_project(session, name="importer-project", owner=user) + await add_project_member( + session=session, + project=exporter_project, + user=user, + project_role=ProjectRole.USER, + ) + await add_project_member( + session=session, + project=importer_project, + user=user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + 
spec=get_fleet_spec(get_ssh_fleet_configuration(name="shared-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + name="shared-fleet-0", + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + local_exporter_fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-exporter-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=local_exporter_fleet, + name="local-exported-fleet-0", + ) + local_importer_fleet = await create_fleet( + session=session, + project=importer_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="local-importer-fleet")), + ) + await create_instance( + session=session, + project=importer_project, + fleet=local_importer_fleet, + name="local-importer-fleet-0", + ) + response = await client.post( + "/api/instances/list", + headers=get_auth_headers(user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + response_json.sort(key=lambda i: i["name"]) + assert len(response_json) == 3 + assert response_json[0]["name"] == "local-exported-fleet-0" + assert response_json[0]["project_name"] == "exporter-project" + assert response_json[0]["fleet_name"] == "local-exporter-fleet" + assert response_json[1]["name"] == "local-importer-fleet-0" + assert response_json[1]["project_name"] == "importer-project" + assert response_json[1]["fleet_name"] == "local-importer-fleet" + assert response_json[2]["name"] == "shared-fleet-0" + assert response_json[2]["project_name"] == "exporter-project" + assert response_json[2]["fleet_name"] == "shared-fleet" + + async def test_returns_instance_once_if_imported_twice( + self, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, 
name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + name="exported-fleet-0", + ) + for name in ["export-1", "export-2"]: + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + name=name, + ) + response = await client.post( + "/api/instances/list", + headers=get_auth_headers(importer_user.token), + json={"include_imported": True}, + ) + assert response.status_code == 200 + response_json = response.json() + assert len(response_json) == 1 + assert response_json[0]["name"] == "exported-fleet-0" + assert response_json[0]["project_name"] == "exporter-project" + assert response_json[0]["fleet_name"] == "exported-fleet" + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) @@ -509,3 +745,93 @@ async def test_returns_403_if_not_project_member_and_instance_not_exists( json={"id": str(uuid.uuid4())}, ) assert resp.status_code == 403 + + async def test_returns_imported_instance( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + 
user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + "/api/project/exporter-project/instances/get", + headers=get_auth_headers(importer_user.token), + json={"id": str(instance.id)}, + ) + assert response.status_code == 200 + response_json = response.json() + assert response_json["id"] == str(instance.id) + assert response_json["project_name"] == "exporter-project" + assert response_json["fleet_name"] == "exported-fleet" + + async def test_returns_403_on_foreign_instance_if_not_imported( + self, test_db, session: AsyncSession, client: AsyncClient + ): + importer_user = await create_user( + session, name="importer-user", global_role=GlobalRole.USER + ) + not_importer_user = await create_user( + session, name="not-importer-user", global_role=GlobalRole.USER + ) + exporter_project = await create_project( + session, name="exporter-project", owner=importer_user + ) + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + not_importer_project = await create_project( + session, name="not-importer-project", owner=not_importer_user + ) + await add_project_member( + session=session, + project=not_importer_project, + user=not_importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration(name="exported-fleet")), + ) + instance = await create_instance( + session=session, + project=exporter_project, + fleet=fleet, + ) + await create_export( + session=session, + 
exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + response = await client.post( + "/api/project/exporter-project/instances/get", + headers=get_auth_headers(not_importer_user.token), + json={"id": str(instance.id)}, + ) + assert response.status_code == 403 diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 1f6b1ebf3e..4d6e7aa95d 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -57,6 +57,7 @@ from dstack._internal.server.services.runs.spec import validate_run_spec_and_set_defaults from dstack._internal.server.testing.common import ( create_backend, + create_export, create_fleet, create_gateway, create_gateway_compute, @@ -70,6 +71,7 @@ get_fleet_spec, get_job_provisioning_data, get_run_spec, + get_ssh_fleet_configuration, list_events, ) from dstack._internal.server.testing.matchers import SomeUUID4Str @@ -1384,6 +1386,56 @@ async def test_returns_run_plan_instance_volumes( assert response.status_code == 200, response.json() assert response.json() == run_plan_dict + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_run_plan_with_offer_from_imported_fleet( + self, + test_db, + session: AsyncSession, + client: AsyncClient, + ) -> None: + importer_user = await create_user(session, global_role=GlobalRole.USER) + exporter_project = await create_project(session, name="exporter-project") + importer_project = await create_project( + session, name="importer-project", owner=importer_user + ) + await add_project_member( + session=session, + project=importer_project, + user=importer_user, + project_role=ProjectRole.USER, + ) + fleet = await create_fleet( + session=session, + project=exporter_project, + spec=get_fleet_spec(get_ssh_fleet_configuration()), + ) + await create_instance( + session=session, + 
project=exporter_project, + fleet=fleet, + instance_num=1, + backend=BackendType.REMOTE, + ) + await create_export( + session=session, + exporter_project=exporter_project, + importer_projects=[importer_project], + exported_fleets=[fleet], + ) + + run_spec = {"configuration": {"type": "dev-environment", "ide": "vscode"}} + body = {"run_spec": run_spec} + response = await client.post( + "/api/project/importer-project/runs/get_plan", + headers=get_auth_headers(importer_user.token), + json=body, + ) + assert response.status_code == 200, response.json() + response_json = response.json() + assert response_json["project_name"] == "importer-project" + assert response_json["job_plans"][0]["offers"][0]["backend"] == "remote" + @pytest.mark.parametrize( ("client_version", "expected_availability"), [ From 8bedd73777a82270056c3f7d0cf443f5f850f56a Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 5 Mar 2026 10:54:39 +0000 Subject: [PATCH 185/187] Prevent Hot Aisle min reservation period error (#3633) Force delete every Hot Aisle instance to prevent the deletion error if the minimum reservation period is not met and prevent orphaned instances. --- docs/docs/concepts/backends.md | 12 ++++++++++++ .../core/backends/hotaisle/api_client.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 620d5723cb..9b46822e5f 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -891,6 +891,18 @@ projects: * **Owner role for the user** - Required for creating and managing SSH keys * **Operator role for the team** - Required for managing virtual machines within the team +??? info "Pricing" + `dstack` shows the hourly price for Hot Aisle instances. Some instances also require an upfront payment for a minimum reservation period, which is usually a few hours. 
You will be charged for the full minimum period even if you stop the instance early. + + See the Hot Aisle API for the minimum reservation period for each instance type: + +
    + + ```shell + $ curl -H "Authorization: Token $API_KEY" https://admin.hotaisle.app/api/teams/$TEAM_HANDLE/virtual_machines/available/ | jq ".[] | {gpus: .Specs.gpus, MinimumReservationMinutes}" + ``` + +
    ### CloudRift diff --git a/src/dstack/_internal/core/backends/hotaisle/api_client.py b/src/dstack/_internal/core/backends/hotaisle/api_client.py index c4608bd252..a3cc355fcd 100644 --- a/src/dstack/_internal/core/backends/hotaisle/api_client.py +++ b/src/dstack/_internal/core/backends/hotaisle/api_client.py @@ -76,14 +76,25 @@ def get_vm_state(self, vm_name: str) -> str: def terminate_virtual_machine(self, vm_name: str) -> None: url = f"{API_URL}/teams/{self.team_handle}/virtual_machines/{vm_name}/" - response = self._make_request("DELETE", url) + response = self._make_request( + "DELETE", + url, + params={ + "force": "true", # delete even if min reservation time not met + }, + ) if response.status_code == 404: logger.debug("Hot Aisle virtual machine %s not found", vm_name) return response.raise_for_status() def _make_request( - self, method: str, url: str, json: Optional[Dict[str, Any]] = None, timeout: int = 30 + self, + method: str, + url: str, + json: Optional[dict[str, Any]] = None, + params: Optional[dict[str, str]] = None, + timeout: int = 30, ) -> requests.Response: headers = { "accept": "application/json", @@ -97,5 +108,6 @@ def _make_request( url=url, headers=headers, json=json, + params=params, timeout=timeout, ) From 2dfe398c907ef3aa94d2a6ceba935b267b0a0cd9 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 6 Mar 2026 13:21:30 +0500 Subject: [PATCH 186/187] Implement instance pipeline (#3636) * Move delete_instance_health_checks to a separate module * Move utils/provisioning.py to ssh_fleets/provisioning.py * Use SSHProvisioningError for ssh instances errors * Fix _add_remote() nested try-excepts * Refactor _resolve_ssh_instance_network * Refactor _process_instance() into thin dispatcher * Refactor instance check code * Add fetchers tests * Add TestInstanceWorker * Run pyright for pipeline tests * WIP: InstanceWorker * Fix volumes pipeline processing active * Refactor log_lock_token * Build instance events from update map * Rename * Refactor 
instance pipeline into modules * Inline _get_effective_ helpers * Process new instance immediately * Do not refetch status * Fix sibling_update_rows * Drop redundant synchronize_session=False * Add ProcessContext * Simplify placement groups code * Drop _PlacementGroupState * Restore comments * Fix result.sibling_update_rows append * Fix unset typing * Add migration * Lock instances in fleet pipeline * Optimize instance lock in fleet pipeline * Respect instance lock in delete_fleets * Skip locked instances in process_next_terminating_job * Respect instance lock in submitted_jobs * Add ix_instances_pipeline_fetch_q_index * Wire instance pipeline * Set current_master_instance * Refactor current_master_instance * Terminate instances with MASTER_FAILED if the master dies with NO_OFFERS * Fix instance unlock in fleet pipeline * Remove extra fleet_model_to_fleet * Wire pipeline_hinter * Remove extra fleet_model_to_fleet * Fix index name * Rebase migrations * Fix redundant fleet.instances loads in instance pipeline * Do not lock all instances in delete_fleets * Add FIXME * Fix tests * Retry instance lock in delete_fleets * Retry lock in all delete endpoints * Fix created_at and last_processed_at init values --- AGENTS.md | 1 + pyproject.toml | 1 + src/dstack/_internal/core/errors.py | 4 + .../background/pipeline_tasks/__init__.py | 2 + .../server/background/pipeline_tasks/base.py | 47 +- .../pipeline_tasks/compute_groups.py | 18 +- .../background/pipeline_tasks/fleets.py | 490 ++++++--- .../background/pipeline_tasks/gateways.py | 54 +- .../pipeline_tasks/instances/__init__.py | 476 +++++++++ .../pipeline_tasks/instances/check.py | 568 +++++++++++ .../instances/cloud_provisioning.py | 421 ++++++++ .../pipeline_tasks/instances/common.py | 177 ++++ .../pipeline_tasks/instances/ssh_deploy.py | 302 ++++++ .../pipeline_tasks/instances/termination.py | 88 ++ .../pipeline_tasks/placement_groups.py | 16 +- .../background/pipeline_tasks/volumes.py | 32 +- 
.../background/scheduled_tasks/__init__.py | 26 +- .../scheduled_tasks/instance_healthchecks.py | 20 + .../background/scheduled_tasks/instances.py | 403 ++++---- .../scheduled_tasks/submitted_jobs.py | 4 + .../scheduled_tasks/terminating_jobs.py | 1 + ...0aa4_add_instancemodel_pipeline_columns.py | 47 + ...add_ix_instances_pipeline_fetch_q_index.py | 49 + ...4d986_add_fleet_current_master_instance.py | 37 + ...dd_ix_fleets_current_master_instance_id.py | 42 + src/dstack/_internal/server/models.py | 25 +- src/dstack/_internal/server/routers/fleets.py | 5 + .../_internal/server/services/fleets.py | 162 +-- .../server/services/gateways/__init__.py | 35 +- .../_internal/server/services/instances.py | 32 +- .../_internal/server/services/runs/plan.py | 9 +- .../server/services/ssh_fleets/__init__.py | 0 .../ssh_fleets}/provisioning.py | 58 +- .../_internal/server/services/volumes.py | 38 +- src/dstack/_internal/utils/common.py | 13 +- .../pipeline_tasks/test_compute_groups.py | 122 ++- .../background/pipeline_tasks/test_fleets.py | 764 +++++++++++++- .../pipeline_tasks/test_gateways.py | 178 +++- .../pipeline_tasks/test_instances/__init__.py | 0 .../pipeline_tasks/test_instances/conftest.py | 58 ++ .../pipeline_tasks/test_instances/helpers.py | 40 + .../test_instances/test_check.py | 944 ++++++++++++++++++ .../test_instances/test_cloud_provisioning.py | 872 ++++++++++++++++ .../test_instances/test_pipeline.py | 253 +++++ .../test_instances/test_ssh_deploy.py | 248 +++++ .../test_instances/test_termination.py | 219 ++++ .../pipeline_tasks/test_placement_groups.py | 151 ++- .../background/pipeline_tasks/test_volumes.py | 156 ++- .../test_instance_healthchecks.py | 49 + .../scheduled_tasks/test_instances.py | 148 ++- .../_internal/server/routers/test_fleets.py | 78 ++ .../server/services/test_instances.py | 118 +++ 52 files changed, 7535 insertions(+), 566 deletions(-) create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/instances/__init__.py create mode 
100644 src/dstack/_internal/server/background/pipeline_tasks/instances/check.py create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/instances/cloud_provisioning.py create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/instances/common.py create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/instances/ssh_deploy.py create mode 100644 src/dstack/_internal/server/background/pipeline_tasks/instances/termination.py create mode 100644 src/dstack/_internal/server/background/scheduled_tasks/instance_healthchecks.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/03_05_0547_8e8647f20aa4_add_instancemodel_pipeline_columns.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/03_05_0751_297c68450cc8_add_ix_instances_pipeline_fetch_q_index.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/03_05_1015_9cb8e4e4d986_add_fleet_current_master_instance.py create mode 100644 src/dstack/_internal/server/migrations/versions/2026/03_05_1045_c7b0a8e57294_add_ix_fleets_current_master_instance_id.py create mode 100644 src/dstack/_internal/server/services/ssh_fleets/__init__.py rename src/dstack/_internal/server/{utils => services/ssh_fleets}/provisioning.py (86%) create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_instances/__init__.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_instances/conftest.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_instances/helpers.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_instances/test_check.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_instances/test_cloud_provisioning.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_instances/test_pipeline.py create mode 100644 
src/tests/_internal/server/background/pipeline_tasks/test_instances/test_ssh_deploy.py create mode 100644 src/tests/_internal/server/background/pipeline_tasks/test_instances/test_termination.py create mode 100644 src/tests/_internal/server/background/scheduled_tasks/test_instance_healthchecks.py diff --git a/AGENTS.md b/AGENTS.md index eb348b291b..336b97bb5b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -18,6 +18,7 @@ - Python targets 3.9+ with 4-space indentation and max line length of 99 (see `ruff.toml`; `E501` is ignored but keep lines readable). - Imports are sorted via Ruff’s isort settings (`dstack` treated as first-party). - Keep primary/public functions before local helper functions in a module section. +- Keep private classes, exceptions, and similar implementation-specific types close to the private functions that use them unless they are shared more broadly in the module. - Prefer pydantic-style models in `core/models`. - Tests use `test_*.py` modules and `test_*` functions; fixtures live near usage. 
diff --git a/pyproject.toml b/pyproject.toml index 259cbf7b25..8c53b2e166 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,6 +106,7 @@ include = [ "src/dstack/_internal/core/backends/runpod", "src/dstack/_internal/cli/services/configurators", "src/dstack/_internal/cli/commands", + "src/tests/_internal/server/background/pipeline_tasks", ] ignore = [ "src/dstack/_internal/server/migrations/versions", diff --git a/src/dstack/_internal/core/errors.py b/src/dstack/_internal/core/errors.py index 0bfd5f6f33..0d4262fe9b 100644 --- a/src/dstack/_internal/core/errors.py +++ b/src/dstack/_internal/core/errors.py @@ -136,6 +136,10 @@ class ConfigurationError(DstackError): pass +class SSHProvisioningError(DstackError): + pass + + class SSHError(DstackError): pass diff --git a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py index 6b3762419f..556e13daaf 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/__init__.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/__init__.py @@ -4,6 +4,7 @@ from dstack._internal.server.background.pipeline_tasks.compute_groups import ComputeGroupPipeline from dstack._internal.server.background.pipeline_tasks.fleets import FleetPipeline from dstack._internal.server.background.pipeline_tasks.gateways import GatewayPipeline +from dstack._internal.server.background.pipeline_tasks.instances import InstancePipeline from dstack._internal.server.background.pipeline_tasks.placement_groups import ( PlacementGroupPipeline, ) @@ -19,6 +20,7 @@ def __init__(self) -> None: ComputeGroupPipeline(), FleetPipeline(), GatewayPipeline(), + InstancePipeline(), PlacementGroupPipeline(), VolumePipeline(), ] diff --git a/src/dstack/_internal/server/background/pipeline_tasks/base.py b/src/dstack/_internal/server/background/pipeline_tasks/base.py index aa5af9a4a3..76073b7893 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/base.py +++ 
b/src/dstack/_internal/server/background/pipeline_tasks/base.py @@ -1,6 +1,8 @@ import asyncio +import logging import math import random +import time import uuid from abc import ABC, abstractmethod from collections.abc import Iterable, Sequence @@ -331,6 +333,7 @@ async def start(self): self._running = True while self._running: item = await self._queue.get() + start_time = time.time() logger.debug("Processing %s item %s", item.__tablename__, item.id) try: await self.process(item) @@ -338,7 +341,12 @@ async def start(self): logger.exception("Unexpected exception when processing item") finally: await self._heartbeater.untrack(item) - logger.debug("Processed %s item %s", item.__tablename__, item.id) + logger.debug( + "Processed %s item %s in %.3f", + item.__tablename__, + item.id, + time.time() - start_time, + ) def stop(self): self._running = False @@ -416,3 +424,40 @@ def resolve_now_placeholders(update_values: _ResolveNowInput, now: datetime): for key, value in update_values.items(): if value is NOW_PLACEHOLDER: update_values[key] = now + + +def log_lock_token_mismatch( + logger: logging.Logger, + item: PipelineItem, + action: str = "process", +) -> None: + logger.warning( + "Failed to %s %s item %s: lock_token mismatch." + " The item is expected to be processed and updated on another fetch iteration.", + action, + item.__tablename__, + item.id, + ) + + +def log_lock_token_changed_after_processing( + logger: logging.Logger, + item: PipelineItem, + action: str = "update", + expected_outcome: str = "updated", +) -> None: + logger.warning( + "Failed to %s %s item %s after processing: lock_token changed." + " The item is expected to be processed and %s on another fetch iteration.", + action, + item.__tablename__, + item.id, + expected_outcome, + ) + + +def log_lock_token_changed_on_reset(logger: logging.Logger) -> None: + logger.warning( + "Failed to reset lock: lock_token changed." + " The item is expected to be processed and updated on another fetch iteration." 
+ ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py index 0ee2975eb2..69ce3e7998 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/compute_groups.py @@ -20,6 +20,8 @@ PipelineItem, UpdateMapDateTime, Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, resolve_now_placeholders, set_processed_update_map_fields, set_unlock_update_map_fields, @@ -194,12 +196,7 @@ async def process(self, item: PipelineItem): ) compute_group_model = res.unique().scalar_one_or_none() if compute_group_model is None: - logger.warning( - "Failed to process %s item %s: lock_token mismatch." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_mismatch(logger, item) return result = _TerminateResult() @@ -228,12 +225,7 @@ async def process(self, item: PipelineItem): ) updated_ids = list(res.scalars().all()) if len(updated_ids) == 0: - logger.warning( - "Failed to update %s item %s after processing: lock_token changed." 
- " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_changed_after_processing(logger, item) return if not result.instances_update_map: return @@ -249,6 +241,8 @@ async def process(self, item: PipelineItem): instance_model=instance_model, old_status=instance_model.status, new_status=InstanceStatus.TERMINATED, + termination_reason=instance_model.termination_reason, + termination_reason_message=instance_model.termination_reason_message, ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/fleets.py b/src/dstack/_internal/server/background/pipeline_tasks/fleets.py index 55ffcd7f94..2a63e21bd5 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/fleets.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/fleets.py @@ -2,13 +2,17 @@ import uuid from dataclasses import dataclass, field from datetime import timedelta -from typing import Sequence, TypedDict +from typing import Optional, Sequence, TypedDict from sqlalchemy import or_, select, update from sqlalchemy.ext.asyncio.session import AsyncSession from sqlalchemy.orm import joinedload, load_only, selectinload -from dstack._internal.core.models.fleets import FleetSpec, FleetStatus +from dstack._internal.core.models.fleets import ( + FleetSpec, + FleetStatus, + InstanceGroupPlacement, +) from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason from dstack._internal.core.models.runs import RunStatus from dstack._internal.server.background.pipeline_tasks.base import ( @@ -20,6 +24,9 @@ PipelineItem, UpdateMapDateTime, Worker, + log_lock_token_changed_after_processing, + log_lock_token_changed_on_reset, + log_lock_token_mismatch, resolve_now_placeholders, set_processed_update_map_fields, set_unlock_update_map_fields, @@ -210,77 +217,35 @@ async def process(self, item: PipelineItem): ) fleet_model = res.unique().scalar_one_or_none() if fleet_model is None: - 
logger.warning( - "Failed to process %s item %s: lock_token mismatch." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_mismatch(logger, item) return - instance_lock, _ = get_locker(get_db().dialect_name).get_lockset( - InstanceModel.__tablename__ - ) - async with instance_lock: - res = await session.execute( - select(InstanceModel) - .where( - InstanceModel.fleet_id == item.id, - InstanceModel.deleted == False, - # TODO: Lock instance models in the DB - # or_( - # InstanceModel.lock_expires_at.is_(None), - # InstanceModel.lock_expires_at < get_current_datetime(), - # ), - # or_( - # InstanceModel.lock_owner.is_(None), - # InstanceModel.lock_owner == FleetPipeline.__name__, - # ), - ) - .with_for_update(skip_locked=True, key_share=True) + # Lock instance only if consolidation is needed. + locked_instance_ids: set[uuid.UUID] = set() + consolidation_fleet_spec = _get_fleet_spec_if_ready_for_consolidation(fleet_model) + consolidation_instances = None + if consolidation_fleet_spec is not None: + consolidation_instances = await _lock_fleet_instances_for_consolidation( + session=session, + item=item, ) - locked_instance_models = res.scalars().all() - if len(fleet_model.instances) != len(locked_instance_models): - logger.debug( - "Failed to lock fleet %s instances. The fleet will be processed later.", - item.id, - ) - now = get_current_datetime() - # Keep `lock_owner` so that `InstancePipeline` sees that the fleet is being locked - # but unset `lock_expires_at` to process the item again ASAP (after `min_processing_interval`). - # Unset `lock_token` so that heartbeater can no longer update the item. 
- res = await session.execute( - update(FleetModel) - .where( - FleetModel.id == item.id, - FleetModel.lock_token == item.lock_token, - ) - .values( - lock_expires_at=None, - lock_token=None, - last_processed_at=now, - ) - ) - if res.rowcount == 0: # pyright: ignore[reportAttributeAccessIssue] - logger.warning( - "Failed to reset lock: lock_token changed." - " The item is expected to be processed and updated on another fetch iteration." - ) + if consolidation_instances is None: return + locked_instance_ids = {instance.id for instance in consolidation_instances} - # TODO: Lock instance models in the DB - # for instance_model in locked_instance_models: - # instance_model.lock_expires_at = item.lock_expires_at - # instance_model.lock_token = item.lock_token - # instance_model.lock_owner = FleetPipeline.__name__ - # await session.commit() - - result = await _process_fleet(fleet_model) + result = await _process_fleet( + fleet_model, + consolidation_fleet_spec=consolidation_fleet_spec, + consolidation_instances=consolidation_instances, + ) fleet_update_map = _FleetUpdateMap() fleet_update_map.update(result.fleet_update_map) set_processed_update_map_fields(fleet_update_map) set_unlock_update_map_fields(fleet_update_map) - instance_update_rows = _build_instance_update_rows(result.instance_id_to_update_map) + instance_update_rows = _build_instance_update_rows( + result.instance_id_to_update_map, + unlock_instance_ids=locked_instance_ids, + ) async with get_session_ctx() as session: now = get_current_datetime() @@ -297,12 +262,13 @@ async def process(self, item: PipelineItem): ) updated_ids = list(res.scalars().all()) if len(updated_ids) == 0: - logger.warning( - "Failed to update %s item %s after processing: lock_token changed." 
- " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_changed_after_processing(logger, item) + if locked_instance_ids: + await _unlock_fleet_locked_instances( + session=session, + item=item, + locked_instance_ids=locked_instance_ids, + ) # TODO: Clean up fleet. return @@ -314,14 +280,14 @@ async def process(self, item: PipelineItem): ) if instance_update_rows: await session.execute( - update(InstanceModel).execution_options(synchronize_session=False), + update(InstanceModel), instance_update_rows, ) - if result.new_instances_count > 0: + if len(result.new_instance_creates) > 0: await _create_missing_fleet_instances( session=session, fleet_model=fleet_model, - new_instances_count=result.new_instances_count, + new_instance_creates=result.new_instance_creates, ) emit_fleet_status_change_event( session=session, @@ -339,9 +305,10 @@ class _FleetUpdateMap(ItemUpdateMap, total=False): deleted_at: UpdateMapDateTime consolidation_attempt: int last_consolidated_at: UpdateMapDateTime + current_master_instance_id: Optional[uuid.UUID] -class _InstanceUpdateMap(TypedDict, total=False): +class _InstanceUpdateMap(ItemUpdateMap, total=False): status: InstanceStatus termination_reason: InstanceTerminationReason termination_reason_message: str @@ -351,51 +318,154 @@ class _InstanceUpdateMap(TypedDict, total=False): id: uuid.UUID +def _get_fleet_spec_if_ready_for_consolidation(fleet_model: FleetModel) -> Optional[FleetSpec]: + if fleet_model.status == FleetStatus.TERMINATING: + return None + consolidation_fleet_spec = get_fleet_spec(fleet_model) + if ( + consolidation_fleet_spec.configuration.nodes is None + or consolidation_fleet_spec.autocreated + ): + return None + if not _is_fleet_ready_for_consolidation(fleet_model): + return None + return consolidation_fleet_spec + + +async def _lock_fleet_instances_for_consolidation( + session: AsyncSession, + item: PipelineItem, +) -> 
Optional[list[InstanceModel]]: + instance_lock, _ = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__) + async with instance_lock: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.fleet_id == item.id, + InstanceModel.deleted == False, + or_( + InstanceModel.lock_expires_at.is_(None), + InstanceModel.lock_expires_at < get_current_datetime(), + ), + or_( + InstanceModel.lock_owner.is_(None), + InstanceModel.lock_owner == FleetPipeline.__name__, + ), + ) + .with_for_update(skip_locked=True, key_share=True) + ) + locked_instance_models = list(res.scalars().all()) + locked_instance_ids = {instance_model.id for instance_model in locked_instance_models} + + res = await session.execute( + select(InstanceModel.id).where( + InstanceModel.fleet_id == item.id, + InstanceModel.deleted == False, + ) + ) + current_instance_ids = set(res.scalars().all()) + if current_instance_ids != locked_instance_ids: + logger.debug( + "Failed to lock fleet %s instances. The fleet will be processed later.", + item.id, + ) + # Keep `lock_owner` so that `InstancePipeline` sees that the fleet is being locked + # but unset `lock_expires_at` to process the item again ASAP (after `min_processing_interval`). + # Unset `lock_token` so that heartbeater can no longer update the item. 
+ res = await session.execute( + update(FleetModel) + .where( + FleetModel.id == item.id, + FleetModel.lock_token == item.lock_token, + ) + .values( + lock_expires_at=None, + lock_token=None, + last_processed_at=get_current_datetime(), + ) + .returning(FleetModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_on_reset(logger) + return None + + for instance_model in locked_instance_models: + instance_model.lock_expires_at = item.lock_expires_at + instance_model.lock_token = item.lock_token + instance_model.lock_owner = FleetPipeline.__name__ + await session.commit() + return locked_instance_models + + @dataclass class _ProcessResult: fleet_update_map: _FleetUpdateMap = field(default_factory=_FleetUpdateMap) instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap] = field(default_factory=dict) - new_instances_count: int = 0 + new_instance_creates: list["_NewInstanceCreate"] = field(default_factory=list) + + +class _NewInstanceCreate(TypedDict): + id: uuid.UUID + instance_num: int @dataclass class _MaintainNodesResult: instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap] = field(default_factory=dict) - new_instances_count: int = 0 + new_instance_creates: list[_NewInstanceCreate] = field(default_factory=list) changes_required: bool = False @property def has_changes(self) -> bool: - return len(self.instance_id_to_update_map) > 0 or self.new_instances_count > 0 + return len(self.instance_id_to_update_map) > 0 or len(self.new_instance_creates) > 0 -async def _process_fleet(fleet_model: FleetModel) -> _ProcessResult: - result = _consolidate_fleet_state_with_spec(fleet_model) - if result.new_instances_count > 0: - # Avoid deleting fleets that are about to provision new instances. 
- return result - delete = _should_delete_fleet(fleet_model) - if delete: +async def _process_fleet( + fleet_model: FleetModel, + consolidation_fleet_spec: Optional[FleetSpec] = None, + consolidation_instances: Optional[Sequence[InstanceModel]] = None, +) -> _ProcessResult: + result = _ProcessResult() + effective_instances = list(consolidation_instances or fleet_model.instances) + if consolidation_fleet_spec is not None: + result = _consolidate_fleet_state_with_spec( + fleet_model, + consolidation_fleet_spec=consolidation_fleet_spec, + consolidation_instances=effective_instances, + ) + if len(result.new_instance_creates) == 0 and _should_delete_fleet(fleet_model): result.fleet_update_map["status"] = FleetStatus.TERMINATED result.fleet_update_map["deleted"] = True result.fleet_update_map["deleted_at"] = NOW_PLACEHOLDER + _set_fail_instances_on_master_bootstrap_failure( + fleet_model=fleet_model, + instance_models=effective_instances, + instance_id_to_update_map=result.instance_id_to_update_map, + ) + _set_current_master_instance_id( + fleet_model=fleet_model, + fleet_update_map=result.fleet_update_map, + instance_models=effective_instances, + instance_id_to_update_map=result.instance_id_to_update_map, + new_instance_creates=result.new_instance_creates, + ) return result -def _consolidate_fleet_state_with_spec(fleet_model: FleetModel) -> _ProcessResult: +def _consolidate_fleet_state_with_spec( + fleet_model: FleetModel, + consolidation_fleet_spec: FleetSpec, + consolidation_instances: Sequence[InstanceModel], +) -> _ProcessResult: result = _ProcessResult() - if fleet_model.status == FleetStatus.TERMINATING: - return result - fleet_spec = get_fleet_spec(fleet_model) - if fleet_spec.configuration.nodes is None or fleet_spec.autocreated: - # Only explicitly created cloud fleets are consolidated. 
- return result - if not _is_fleet_ready_for_consolidation(fleet_model): - return result - maintain_nodes_result = _maintain_fleet_nodes_in_min_max_range(fleet_model, fleet_spec) + maintain_nodes_result = _maintain_fleet_nodes_in_min_max_range( + instances=consolidation_instances, + fleet_spec=consolidation_fleet_spec, + ) if maintain_nodes_result.has_changes: result.instance_id_to_update_map = maintain_nodes_result.instance_id_to_update_map - result.new_instances_count = maintain_nodes_result.new_instances_count + result.new_instance_creates = maintain_nodes_result.new_instance_creates if maintain_nodes_result.changes_required: result.fleet_update_map["consolidation_attempt"] = fleet_model.consolidation_attempt + 1 else: @@ -431,7 +501,7 @@ def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta: def _maintain_fleet_nodes_in_min_max_range( - fleet_model: FleetModel, + instances: Sequence[InstanceModel], fleet_spec: FleetSpec, ) -> _MaintainNodesResult: """ @@ -439,7 +509,7 @@ def _maintain_fleet_nodes_in_min_max_range( """ assert fleet_spec.configuration.nodes is not None result = _MaintainNodesResult() - for instance in fleet_model.instances: + for instance in instances: # Delete terminated but not deleted instances since # they are going to be replaced with new pending instances. 
if instance.status == InstanceStatus.TERMINATED and not instance.deleted: @@ -449,13 +519,19 @@ def _maintain_fleet_nodes_in_min_max_range( "deleted_at": NOW_PLACEHOLDER, } active_instances = [ - i for i in fleet_model.instances if i.status != InstanceStatus.TERMINATED and not i.deleted + i for i in instances if i.status != InstanceStatus.TERMINATED and not i.deleted ] active_instances_num = len(active_instances) if active_instances_num < fleet_spec.configuration.nodes.min: result.changes_required = True nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num - result.new_instances_count = nodes_missing + taken_instance_nums = {instance.instance_num for instance in active_instances} + for _ in range(nodes_missing): + instance_num = get_next_instance_num(taken_instance_nums) + taken_instance_nums.add(instance_num) + result.new_instance_creates.append( + _NewInstanceCreate(id=uuid.uuid4(), instance_num=instance_num) + ) return result if ( fleet_spec.configuration.nodes.max is None @@ -467,7 +543,7 @@ def _maintain_fleet_nodes_in_min_max_range( # or if nodes.max is updated. 
result.changes_required = True nodes_redundant = active_instances_num - fleet_spec.configuration.nodes.max - for instance in fleet_model.instances: + for instance in instances: if nodes_redundant == 0: break if instance.status == InstanceStatus.IDLE: @@ -506,42 +582,59 @@ def _should_delete_fleet(fleet_model: FleetModel) -> bool: def _build_instance_update_rows( instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], + unlock_instance_ids: set[uuid.UUID], ) -> list[_InstanceUpdateMap]: instance_update_rows = [] - for instance_id, instance_update_map in instance_id_to_update_map.items(): + for instance_id in sorted(instance_id_to_update_map.keys() | unlock_instance_ids): + instance_update_map = instance_id_to_update_map.get(instance_id) update_row = _InstanceUpdateMap() - update_row.update(instance_update_map) + if instance_update_map is not None: + update_row.update(instance_update_map) + if instance_id in unlock_instance_ids: + set_unlock_update_map_fields(update_row) update_row["id"] = instance_id set_processed_update_map_fields(update_row) instance_update_rows.append(update_row) return instance_update_rows +async def _unlock_fleet_locked_instances( + session: AsyncSession, + item: PipelineItem, + locked_instance_ids: set[uuid.UUID], +) -> None: + await session.execute( + update(InstanceModel) + .where( + InstanceModel.id.in_(locked_instance_ids), + InstanceModel.lock_token == item.lock_token, + InstanceModel.lock_owner == FleetPipeline.__name__, + ) + .values( + lock_expires_at=None, + lock_token=None, + lock_owner=None, + ) + ) + + async def _create_missing_fleet_instances( session: AsyncSession, fleet_model: FleetModel, - new_instances_count: int, + new_instance_creates: Sequence[_NewInstanceCreate], ): fleet_spec = get_fleet_spec(fleet_model) - res = await session.execute( - select(InstanceModel.instance_num).where( - InstanceModel.fleet_id == fleet_model.id, - InstanceModel.deleted == False, - ) - ) - taken_instance_nums = set(res.scalars().all()) - 
for _ in range(new_instances_count): - instance_num = get_next_instance_num(taken_instance_nums) + for new_instance_create in new_instance_creates: instance_model = create_fleet_instance_model( session=session, project=fleet_model.project, # TODO: Store fleet.user and pass it instead of the project owner. username=fleet_model.project.owner.name, spec=fleet_spec, - instance_num=instance_num, + instance_num=new_instance_create["instance_num"], + instance_id=new_instance_create["id"], ) instance_model.fleet_id = fleet_model.id - taken_instance_nums.add(instance_num) events.emit( session=session, message=( @@ -553,6 +646,173 @@ async def _create_missing_fleet_instances( ) logger.info( "Added %d instances to fleet %s", - new_instances_count, + len(new_instance_creates), fleet_model.name, ) + + +def _set_fail_instances_on_master_bootstrap_failure( + fleet_model: FleetModel, + instance_models: Sequence[InstanceModel], + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], +) -> None: + """ + Terminates instances with MASTER_FAILED if the master dies with NO_OFFERS in a cluster with node.min == 0. + This is needed to avoid master re-election loop and fail fast. 
+ """ + fleet_spec = get_fleet_spec(fleet_model) + if ( + not _is_cloud_cluster_fleet_spec(fleet_spec) + or fleet_spec.configuration.nodes is None + or fleet_spec.configuration.nodes.min != 0 + or fleet_model.current_master_instance_id is None + ): + return + + current_master_instance_model = None + for instance_model in instance_models: + if instance_model.id == fleet_model.current_master_instance_id: + current_master_instance_model = instance_model + break + if current_master_instance_model is None: + return + + if ( + current_master_instance_model.status != InstanceStatus.TERMINATED + or current_master_instance_model.termination_reason != InstanceTerminationReason.NO_OFFERS + ): + return + + surviving_instance_models = _get_surviving_instance_models_after_updates( + instance_models=instance_models, + instance_id_to_update_map=instance_id_to_update_map, + ) + if any( + instance_model.status not in InstanceStatus.finished_statuses() + and instance_model.job_provisioning_data is not None + for instance_model in surviving_instance_models + ): + # It should not be possible to provision non-master instances ahead of master + # but we still safe-guard against the case when there can be other instances provisioned. + return + + for instance_model in surviving_instance_models: + if ( + instance_model.id == current_master_instance_model.id + or instance_model.status in InstanceStatus.finished_statuses() + ): + continue + update_map = instance_id_to_update_map.setdefault(instance_model.id, _InstanceUpdateMap()) + update_map["status"] = InstanceStatus.TERMINATED + update_map["termination_reason"] = InstanceTerminationReason.MASTER_FAILED + + +def _set_current_master_instance_id( + fleet_model: FleetModel, + fleet_update_map: _FleetUpdateMap, + instance_models: Sequence[InstanceModel], + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], + new_instance_creates: Sequence[_NewInstanceCreate], +) -> None: + """ + Sets `current_master_instance_id` for `fleet_model`. 
+ Master instance can be changed if the previous master is gone. + If there are no active instances, newly selected master may change backend/region/az/placement. + """ + fleet_spec = get_fleet_spec(fleet_model) + if not _is_cloud_cluster_fleet_spec(fleet_spec): + fleet_update_map["current_master_instance_id"] = None + return + surviving_instance_models = _get_surviving_instance_models_after_updates( + instance_models=instance_models, + instance_id_to_update_map=instance_id_to_update_map, + ) + current_master_instance_id = _select_current_master_instance_id( + current_master_instance_id=fleet_model.current_master_instance_id, + surviving_instance_models=surviving_instance_models, + instance_id_to_update_map=instance_id_to_update_map, + new_instance_creates=new_instance_creates, + ) + fleet_update_map["current_master_instance_id"] = current_master_instance_id + + +def _get_surviving_instance_models_after_updates( + instance_models: Sequence[InstanceModel], + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], +) -> list[InstanceModel]: + surviving_instance_models = [] + for instance_model in sorted(instance_models, key=lambda i: (i.instance_num, i.created_at)): + instance_update_map = instance_id_to_update_map.get(instance_model.id) + if instance_update_map is not None and instance_update_map.get("deleted"): + continue + surviving_instance_models.append(instance_model) + return surviving_instance_models + + +def _select_current_master_instance_id( + current_master_instance_id: Optional[uuid.UUID], + surviving_instance_models: Sequence[InstanceModel], + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], + new_instance_creates: Sequence[_NewInstanceCreate], +) -> Optional[uuid.UUID]: + # Keep the current master stable while it is still alive so InstancePipeline + # does not see fleet-wide election churn between provisioning attempts. 
+ if current_master_instance_id is not None: + for instance_model in surviving_instance_models: + if ( + instance_model.id == current_master_instance_id + and _get_effective_instance_status( + instance_model, + instance_id_to_update_map=instance_id_to_update_map, + ) + not in InstanceStatus.finished_statuses() + ): + return instance_model.id + + # If the old master is gone, prefer a surviving provisioned instance so we + # keep following an already-established cluster placement decision. + for instance_model in surviving_instance_models: + if ( + _get_effective_instance_status( + instance_model, + instance_id_to_update_map=instance_id_to_update_map, + ) + not in InstanceStatus.finished_statuses() + and instance_model.job_provisioning_data is not None + ): + return instance_model.id + + # Prefer existing surviving instances over freshly planned replacements to + # avoid election churn during min-nodes backfill. + for instance_model in surviving_instance_models: + if ( + _get_effective_instance_status( + instance_model, + instance_id_to_update_map=instance_id_to_update_map, + ) + not in InstanceStatus.finished_statuses() + ): + return instance_model.id + + for new_instance_create in sorted(new_instance_creates, key=lambda i: i["instance_num"]): + return new_instance_create["id"] + + return None + + +def _get_effective_instance_status( + instance_model: InstanceModel, + instance_id_to_update_map: dict[uuid.UUID, _InstanceUpdateMap], +) -> InstanceStatus: + update_map = instance_id_to_update_map.get(instance_model.id) + if update_map is None: + return instance_model.status + return update_map.get("status", instance_model.status) + + +def _is_cloud_cluster_fleet_spec(fleet_spec: FleetSpec) -> bool: + configuration = fleet_spec.configuration + return ( + configuration.placement == InstanceGroupPlacement.CLUSTER + and configuration.ssh_config is None + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/gateways.py 
b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py index 2d5f0a947b..81ba2ae708 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/gateways.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/gateways.py @@ -18,6 +18,8 @@ Pipeline, PipelineItem, Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, resolve_now_placeholders, set_processed_update_map_fields, set_unlock_update_map_fields, @@ -219,12 +221,7 @@ async def _process_submitted_item(item: GatewayPipelineItem): ) gateway_model = res.unique().scalar_one_or_none() if gateway_model is None: - logger.warning( - "Failed to process %s item %s: lock_token mismatch." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_mismatch(logger, item) return result = await _process_submitted_gateway(gateway_model) @@ -251,12 +248,7 @@ async def _process_submitted_item(item: GatewayPipelineItem): ) updated_ids = list(res.scalars().all()) if len(updated_ids) == 0: - logger.warning( - "Failed to update %s item %s after processing: lock_token changed." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_changed_after_processing(logger, item) # TODO: Clean up gateway_compute_model. return emit_gateway_status_change_event( @@ -345,12 +337,7 @@ async def _process_provisioning_item(item: GatewayPipelineItem): ) gateway_model = res.unique().scalar_one_or_none() if gateway_model is None: - logger.warning( - "Failed to process %s item %s: lock_token mismatch." 
- " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_mismatch(logger, item) return result = await _process_provisioning_gateway(gateway_model) @@ -372,12 +359,7 @@ async def _process_provisioning_item(item: GatewayPipelineItem): ) updated_ids = list(res.scalars().all()) if len(updated_ids) == 0: - logger.warning( - "Failed to update %s item %s after processing: lock_token changed." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_changed_after_processing(logger, item) return emit_gateway_status_change_event( session=session, @@ -464,12 +446,7 @@ async def _process_to_be_deleted_item(item: GatewayPipelineItem): ) gateway_model = res.unique().scalar_one_or_none() if gateway_model is None: - logger.warning( - "Failed to process %s item %s: lock_token mismatch." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_mismatch(logger, item) return result = await _process_to_be_deleted_gateway(gateway_model) @@ -485,11 +462,11 @@ async def _process_to_be_deleted_item(item: GatewayPipelineItem): ) deleted_ids = list(res.scalars().all()) if len(deleted_ids) == 0: - logger.warning( - "Failed to delete %s item %s after processing: lock_token changed." - " The item is expected to be processed and deleted on another fetch iteration.", - item.__tablename__, - item.id, + log_lock_token_changed_after_processing( + logger, + item, + action="delete", + expected_outcome="deleted", ) return events.emit( @@ -514,12 +491,7 @@ async def _process_to_be_deleted_item(item: GatewayPipelineItem): ) updated_ids = list(res.scalars().all()) if len(updated_ids) == 0: - logger.warning( - "Failed to update %s item %s after processing: lock_token changed." 
- " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_changed_after_processing(logger, item) return if result.gateway_compute_update_map: diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/__init__.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/__init__.py new file mode 100644 index 0000000000..b5289e05e9 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/__init__.py @@ -0,0 +1,476 @@ +import asyncio +import uuid +from dataclasses import dataclass +from datetime import timedelta +from typing import Optional, Sequence + +from sqlalchemy import and_, not_, or_, select, update +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload, load_only + +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.server.background.pipeline_tasks.base import ( + Fetcher, + Heartbeater, + Pipeline, + PipelineItem, + Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, + resolve_now_placeholders, + set_processed_update_map_fields, + set_unlock_update_map_fields, +) +from dstack._internal.server.background.pipeline_tasks.instances.check import ( + check_instance, + process_idle_timeout, +) +from dstack._internal.server.background.pipeline_tasks.instances.cloud_provisioning import ( + create_cloud_instance, +) +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + ProcessResult, +) +from dstack._internal.server.background.pipeline_tasks.instances.ssh_deploy import ( + add_ssh_instance, +) +from dstack._internal.server.background.pipeline_tasks.instances.termination import ( + terminate_instance, +) +from dstack._internal.server.db import get_db, get_session_ctx +from dstack._internal.server.models import ( + InstanceHealthCheckModel, + 
InstanceModel, + JobModel, + ProjectModel, +) +from dstack._internal.server.services import events +from dstack._internal.server.services.instances import ( + emit_instance_status_change_event, + is_ssh_instance, +) +from dstack._internal.server.services.locking import get_locker +from dstack._internal.server.services.placement import ( + schedule_fleet_placement_groups_deletion, +) +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +@dataclass +class InstancePipelineItem(PipelineItem): + status: InstanceStatus + + +class InstancePipeline(Pipeline[InstancePipelineItem]): + def __init__( + self, + workers_num: int = 20, + queue_lower_limit_factor: float = 0.5, + queue_upper_limit_factor: float = 2.0, + min_processing_interval: timedelta = timedelta(seconds=15), + lock_timeout: timedelta = timedelta(seconds=30), + heartbeat_trigger: timedelta = timedelta(seconds=15), + ) -> None: + super().__init__( + workers_num=workers_num, + queue_lower_limit_factor=queue_lower_limit_factor, + queue_upper_limit_factor=queue_upper_limit_factor, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeat_trigger=heartbeat_trigger, + ) + self.__heartbeater = Heartbeater[InstancePipelineItem]( + model_type=InstanceModel, + lock_timeout=self._lock_timeout, + heartbeat_trigger=self._heartbeat_trigger, + ) + self.__fetcher = InstanceFetcher( + queue=self._queue, + queue_desired_minsize=self._queue_desired_minsize, + min_processing_interval=self._min_processing_interval, + lock_timeout=self._lock_timeout, + heartbeater=self._heartbeater, + ) + self.__workers = [ + InstanceWorker(queue=self._queue, heartbeater=self._heartbeater) + for _ in range(self._workers_num) + ] + + @property + def hint_fetch_model_name(self) -> str: + return InstanceModel.__name__ + + @property + def _heartbeater(self) -> 
Heartbeater[InstancePipelineItem]: + return self.__heartbeater + + @property + def _fetcher(self) -> Fetcher[InstancePipelineItem]: + return self.__fetcher + + @property + def _workers(self) -> Sequence["InstanceWorker"]: + return self.__workers + + +class InstanceFetcher(Fetcher[InstancePipelineItem]): + def __init__( + self, + queue: asyncio.Queue[InstancePipelineItem], + queue_desired_minsize: int, + min_processing_interval: timedelta, + lock_timeout: timedelta, + heartbeater: Heartbeater[InstancePipelineItem], + queue_check_delay: float = 1.0, + ) -> None: + super().__init__( + queue=queue, + queue_desired_minsize=queue_desired_minsize, + min_processing_interval=min_processing_interval, + lock_timeout=lock_timeout, + heartbeater=heartbeater, + queue_check_delay=queue_check_delay, + ) + + @sentry_utils.instrument_named_task("pipeline_tasks.InstanceFetcher.fetch") + async def fetch(self, limit: int) -> list[InstancePipelineItem]: + instance_lock, _ = get_locker(get_db().dialect_name).get_lockset( + InstanceModel.__tablename__ + ) + async with instance_lock: + async with get_session_ctx() as session: + now = get_current_datetime() + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.status.in_( + [ + InstanceStatus.PENDING, + InstanceStatus.PROVISIONING, + InstanceStatus.BUSY, + InstanceStatus.IDLE, + InstanceStatus.TERMINATING, + ] + ), + not_( + and_( + InstanceModel.status == InstanceStatus.TERMINATING, + InstanceModel.compute_group_id.is_not(None), + ) + ), + InstanceModel.deleted == False, + or_( + InstanceModel.last_processed_at <= now - self._min_processing_interval, + InstanceModel.last_processed_at == InstanceModel.created_at, + ), + or_( + InstanceModel.lock_expires_at.is_(None), + InstanceModel.lock_expires_at < now, + ), + or_( + InstanceModel.lock_owner.is_(None), + InstanceModel.lock_owner == InstancePipeline.__name__, + ), + ) + .order_by(InstanceModel.last_processed_at.asc()) + .limit(limit) + 
.with_for_update(skip_locked=True, key_share=True, of=InstanceModel) + .options( + load_only( + InstanceModel.id, + InstanceModel.lock_token, + InstanceModel.lock_expires_at, + InstanceModel.status, + ) + ) + ) + instance_models = list(res.scalars().all()) + lock_expires_at = get_current_datetime() + self._lock_timeout + lock_token = uuid.uuid4() + items = [] + for instance_model in instance_models: + prev_lock_expired = instance_model.lock_expires_at is not None + instance_model.lock_expires_at = lock_expires_at + instance_model.lock_token = lock_token + instance_model.lock_owner = InstancePipeline.__name__ + items.append( + InstancePipelineItem( + __tablename__=InstanceModel.__tablename__, + id=instance_model.id, + lock_expires_at=lock_expires_at, + lock_token=lock_token, + prev_lock_expired=prev_lock_expired, + status=instance_model.status, + ) + ) + await session.commit() + return items + + +class InstanceWorker(Worker[InstancePipelineItem]): + def __init__( + self, + queue: asyncio.Queue[InstancePipelineItem], + heartbeater: Heartbeater[InstancePipelineItem], + ) -> None: + super().__init__( + queue=queue, + heartbeater=heartbeater, + ) + + @sentry_utils.instrument_named_task("pipeline_tasks.InstanceWorker.process") + async def process(self, item: InstancePipelineItem): + process_context: Optional[_ProcessContext] = None + if item.status == InstanceStatus.PENDING: + process_context = await _process_pending_item(item) + elif item.status == InstanceStatus.PROVISIONING: + process_context = await _process_provisioning_item(item) + elif item.status == InstanceStatus.IDLE: + process_context = await _process_idle_item(item) + elif item.status == InstanceStatus.BUSY: + process_context = await _process_busy_item(item) + elif item.status == InstanceStatus.TERMINATING: + process_context = await _process_terminating_item(item) + if process_context is None: + return + set_processed_update_map_fields(process_context.result.instance_update_map) + 
set_unlock_update_map_fields(process_context.result.instance_update_map) + await _apply_process_result( + item=item, + instance_model=process_context.instance_model, + result=process_context.result, + ) + + +@dataclass +class _ProcessContext: + instance_model: InstanceModel + result: ProcessResult + + +async def _process_pending_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_pending_or_terminating( + session=session, + item=item, + ) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + if is_ssh_instance(instance_model): + result = await add_ssh_instance(instance_model) + else: + result = await create_cloud_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _process_provisioning_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_check(session=session, item=item) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + result = await check_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _process_idle_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_idle(session=session, item=item) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + idle_result = await process_idle_timeout( + session=session, + instance_model=instance_model, + ) + if idle_result is not None: + return _ProcessContext(instance_model=instance_model, result=idle_result) + result = await check_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _process_busy_item(item: InstancePipelineItem) -> 
Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_check(session=session, item=item) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + result = await check_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _process_terminating_item(item: InstancePipelineItem) -> Optional[_ProcessContext]: + async with get_session_ctx() as session: + instance_model = await _refetch_locked_instance_for_pending_or_terminating( + session=session, + item=item, + ) + if instance_model is None: + log_lock_token_mismatch(logger, item) + return None + result = await terminate_instance(instance_model) + return _ProcessContext(instance_model=instance_model, result=result) + + +async def _refetch_locked_instance_for_pending_or_terminating( + session: AsyncSession, item: InstancePipelineItem +) -> Optional[InstanceModel]: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == item.id, + InstanceModel.lock_token == item.lock_token, + ) + .options(joinedload(InstanceModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + .options(joinedload(InstanceModel.fleet)) + ) + return res.unique().scalar_one_or_none() + + +async def _refetch_locked_instance_for_idle( + session: AsyncSession, item: InstancePipelineItem +) -> Optional[InstanceModel]: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == item.id, + InstanceModel.lock_token == item.lock_token, + ) + .options(joinedload(InstanceModel.project)) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + .options(joinedload(InstanceModel.fleet)) + ) + return res.unique().scalar_one_or_none() + + +async def _refetch_locked_instance_for_check( + session: AsyncSession, item: InstancePipelineItem +) -> 
Optional[InstanceModel]: + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == item.id, + InstanceModel.lock_token == item.lock_token, + ) + .options( + joinedload(InstanceModel.project).load_only( + ProjectModel.id, + ProjectModel.ssh_public_key, + ProjectModel.ssh_private_key, + ) + ) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + ) + return res.unique().scalar_one_or_none() + + +async def _apply_process_result( + item: InstancePipelineItem, + instance_model: InstanceModel, + result: ProcessResult, +) -> None: + async with get_session_ctx() as session: + if result.health_check_create is not None: + session.add(InstanceHealthCheckModel(**result.health_check_create)) + if result.new_placement_group_models: + session.add_all(result.new_placement_group_models) + if result.health_check_create is not None or result.new_placement_group_models: + await session.flush() + + now = get_current_datetime() + resolve_now_placeholders(result.instance_update_map, now=now) + + res = await session.execute( + update(InstanceModel) + .where( + InstanceModel.id == item.id, + InstanceModel.lock_token == item.lock_token, + ) + .values(**result.instance_update_map) + .returning(InstanceModel.id) + ) + updated_ids = list(res.scalars().all()) + if len(updated_ids) == 0: + log_lock_token_changed_after_processing(logger, item) + await session.rollback() + return + + if result.schedule_pg_deletion_fleet_id is not None: + await schedule_fleet_placement_groups_deletion( + session=session, + fleet_id=result.schedule_pg_deletion_fleet_id, + except_placement_group_ids=( + () + if result.schedule_pg_deletion_except_id is None + else (result.schedule_pg_deletion_except_id,) + ), + ) + + emit_instance_status_change_event( + session=session, + instance_model=instance_model, + old_status=instance_model.status, + new_status=result.instance_update_map.get("status", instance_model.status), + 
termination_reason=result.instance_update_map.get( + "termination_reason", instance_model.termination_reason + ), + termination_reason_message=result.instance_update_map.get( + "termination_reason_message", + instance_model.termination_reason_message, + ), + ) + _emit_instance_health_change_event( + session=session, + instance_model=instance_model, + old_health=instance_model.health, + new_health=result.instance_update_map.get("health", instance_model.health), + ) + _emit_instance_reachability_change_event( + session=session, + instance_model=instance_model, + old_status=instance_model.status, + old_unreachable=instance_model.unreachable, + new_unreachable=result.instance_update_map.get( + "unreachable", instance_model.unreachable + ), + ) + + +def _emit_instance_health_change_event( + session: AsyncSession, + instance_model: InstanceModel, + old_health: HealthStatus, + new_health: HealthStatus, +) -> None: + if old_health == new_health: + return + events.emit( + session, + f"Instance health changed {old_health.upper()} -> {new_health.upper()}", + actor=events.SystemActor(), + targets=[events.Target.from_model(instance_model)], + ) + + +def _emit_instance_reachability_change_event( + session: AsyncSession, + instance_model: InstanceModel, + old_status: InstanceStatus, + old_unreachable: bool, + new_unreachable: bool, +) -> None: + if not old_status.is_available() or old_unreachable == new_unreachable: + return + events.emit( + session, + "Instance became unreachable" if new_unreachable else "Instance became reachable", + actor=events.SystemActor(), + targets=[events.Target.from_model(instance_model)], + ) diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/check.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/check.py new file mode 100644 index 0000000000..d23d536cd1 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/check.py @@ -0,0 +1,568 @@ +import logging +import uuid +from datetime 
import timedelta +from typing import Optional + +import gpuhunt +import requests +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import joinedload + +from dstack._internal.core.backends.base.backend import Backend +from dstack._internal.core.backends.base.compute import ( + get_dstack_runner_download_url, + get_dstack_runner_version, + get_dstack_shim_download_url, + get_dstack_shim_version, +) +from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT +from dstack._internal.core.errors import ProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.core.models.profiles import TerminationPolicy +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + TERMINATION_DEADLINE_OFFSET, + HealthCheckCreate, + ProcessResult, + can_terminate_fleet_instances_on_idle_duration, + get_instance_idle_duration, + get_provisioning_deadline, + set_health_update, + set_status_update, + set_unreachable_update, +) +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.models import InstanceHealthCheckModel, InstanceModel, ProjectModel +from dstack._internal.server.schemas.instances import InstanceCheck +from dstack._internal.server.schemas.runner import ( + ComponentInfo, + ComponentStatus, + InstanceHealthResponse, +) +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services.instances import ( + get_instance_provisioning_data, + get_instance_ssh_private_keys, + is_ssh_instance, + remove_dangling_tasks_from_instance, +) +from dstack._internal.server.services.logging import fmt 
+from dstack._internal.server.services.runner import client as runner_client +from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel +from dstack._internal.utils.common import get_current_datetime, get_or_error, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +async def process_idle_timeout( + session: AsyncSession, + instance_model: InstanceModel, +) -> Optional[ProcessResult]: + if not ( + instance_model.status == InstanceStatus.IDLE + and instance_model.termination_policy == TerminationPolicy.DESTROY_AFTER_IDLE + and not instance_model.jobs + ): + return None + # Do not terminate instances on idle duration if fleet is already at `nodes.min`. + # This is an optimization to avoid terminate-create loop. + # There may be race conditions since we don't take the fleet lock. + # That's ok: in the worst case we go below `nodes.min`, but + # the fleet consolidation logic will provision new nodes. + if ( + instance_model.fleet is not None + and not await can_terminate_fleet_instances_on_idle_duration( + session=session, + fleet_model=instance_model.fleet, + ) + ): + return None + + idle_duration = get_instance_idle_duration(instance_model) + if idle_duration <= timedelta(seconds=instance_model.termination_idle_time): + return None + + result = ProcessResult() + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.IDLE_TIMEOUT, + termination_reason_message=f"Instance idle for {idle_duration.seconds}s", + ) + return result + + +async def check_instance(instance_model: InstanceModel) -> ProcessResult: + result = ProcessResult() + if ( + instance_model.status == InstanceStatus.BUSY + and instance_model.jobs + and all(job.status.is_finished() for job in instance_model.jobs) + ): + # A busy instance could have no active jobs due to this bug: + # 
https://github.com/dstackai/dstack/issues/2068 + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.JOB_FINISHED, + ) + logger.warning( + "Detected busy instance %s with finished job. Marked as TERMINATING", + instance_model.name, + extra={ + "instance_name": instance_model.name, + "instance_status": instance_model.status.value, + }, + ) + return result + + job_provisioning_data = get_or_error(get_instance_provisioning_data(instance_model)) + if job_provisioning_data.hostname is None: + return await _process_wait_for_instance_provisioning_data( + instance_model=instance_model, + job_provisioning_data=job_provisioning_data, + ) + + if not job_provisioning_data.dockerized: + if instance_model.status == InstanceStatus.PROVISIONING: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.BUSY, + ) + return result + + check_instance_health = await _should_check_instance_health(instance_model.id) + instance_check = await _run_instance_check( + instance_model=instance_model, + job_provisioning_data=job_provisioning_data, + check_instance_health=check_instance_health, + ) + health_status = _get_health_status_for_instance_check( + instance_model=instance_model, + instance_check=instance_check, + check_instance_health=check_instance_health, + ) + _log_instance_check_result( + instance_model=instance_model, + instance_check=instance_check, + health_status=health_status, + check_instance_health=check_instance_health, + ) + + if instance_check.has_health_checks(): + # ensured by has_health_checks() + assert instance_check.health_response is not None + result.health_check_create = HealthCheckCreate( + instance_id=instance_model.id, + collected_at=get_current_datetime(), + status=health_status, + response=instance_check.health_response.json(), + ) + + set_health_update( + 
update_map=result.instance_update_map, + instance_model=instance_model, + health=health_status, + ) + set_unreachable_update( + update_map=result.instance_update_map, + instance_model=instance_model, + unreachable=not instance_check.reachable, + ) + + if instance_check.reachable: + result.instance_update_map["termination_deadline"] = None + if instance_model.status == InstanceStatus.PROVISIONING: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.IDLE if not instance_model.jobs else InstanceStatus.BUSY, + ) + return result + + now = get_current_datetime() + if not is_ssh_instance(instance_model) and instance_model.termination_deadline is None: + result.instance_update_map["termination_deadline"] = now + TERMINATION_DEADLINE_OFFSET + + if ( + instance_model.status == InstanceStatus.PROVISIONING + and instance_model.started_at is not None + ): + provisioning_deadline = get_provisioning_deadline( + instance_model=instance_model, + job_provisioning_data=job_provisioning_data, + ) + if now > provisioning_deadline: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.PROVISIONING_TIMEOUT, + termination_reason_message="Instance did not become reachable in time", + ) + elif instance_model.status.is_available(): + deadline = instance_model.termination_deadline + if deadline is not None and now > deadline: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.UNREACHABLE, + ) + return result + + +async def _should_check_instance_health(instance_id) -> bool: + health_check_cutoff = get_current_datetime() - timedelta( + seconds=server_settings.SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS + ) + async with get_session_ctx() as session: + res = 
await session.execute( + select(func.count(1)).where( + InstanceHealthCheckModel.instance_id == instance_id, + InstanceHealthCheckModel.collected_at > health_check_cutoff, + ) + ) + return res.scalar_one() == 0 + + +async def _run_instance_check( + instance_model: InstanceModel, + job_provisioning_data: JobProvisioningData, + check_instance_health: bool, +) -> InstanceCheck: + ssh_private_keys = get_instance_ssh_private_keys(instance_model) + instance_check = await run_async( + _check_instance_inner, + ssh_private_keys, + job_provisioning_data, + None, + instance=instance_model, + check_instance_health=check_instance_health, + ) + # May return False if fails to establish ssh connection. + if instance_check is False: + return InstanceCheck(reachable=False, message="SSH or tunnel error") + return instance_check + + +def _get_health_status_for_instance_check( + instance_model: InstanceModel, + instance_check: InstanceCheck, + check_instance_health: bool, +) -> HealthStatus: + if instance_check.reachable and check_instance_health: + return instance_check.get_health_status() + # Keep previous health status. 
+ return instance_model.health + + +def _log_instance_check_result( + instance_model: InstanceModel, + instance_check: InstanceCheck, + health_status: HealthStatus, + check_instance_health: bool, +) -> None: + loglevel = logging.DEBUG + if not instance_check.reachable and instance_model.status.is_available(): + loglevel = logging.WARNING + elif check_instance_health and not health_status.is_healthy(): + loglevel = logging.WARNING + logger.log( + loglevel, + "Instance %s check: reachable=%s health_status=%s message=%r", + instance_model.name, + instance_check.reachable, + health_status.name, + instance_check.message, + extra={"instance_name": instance_model.name, "health_status": health_status}, + ) + + +async def _process_wait_for_instance_provisioning_data( + instance_model: InstanceModel, + job_provisioning_data: JobProvisioningData, +) -> ProcessResult: + result = ProcessResult() + logger.debug("Waiting for instance %s to become running", instance_model.name) + provisioning_deadline = get_provisioning_deadline( + instance_model=instance_model, + job_provisioning_data=job_provisioning_data, + ) + if get_current_datetime() > provisioning_deadline: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.PROVISIONING_TIMEOUT, + termination_reason_message="Backend did not complete provisioning in time", + ) + return result + + backend = await _get_backend_for_provisioning_wait( + project_id=instance_model.project_id, + backend_type=job_provisioning_data.backend, + ) + if backend is None: + logger.warning( + "Instance %s failed because instance's backend is not available", + instance_model.name, + ) + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message="Backend not available", + ) 
+ return result + + try: + await run_async( + backend.compute().update_provisioning_data, + job_provisioning_data, + instance_model.project.ssh_public_key, + instance_model.project.ssh_private_key, + ) + result.instance_update_map["job_provisioning_data"] = job_provisioning_data.json() + except ProvisioningError as exc: + logger.warning( + "Error while waiting for instance %s to become running: %s", + instance_model.name, + repr(exc), + ) + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATING, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message="Error while waiting for instance to become running", + ) + except Exception: + logger.exception( + "Got exception when updating instance %s provisioning data", + instance_model.name, + ) + return result + + +async def _get_backend_for_provisioning_wait( + project_id: uuid.UUID, + backend_type: BackendType, +) -> Optional[Backend]: + async with get_session_ctx() as session: + res = await session.execute( + select(ProjectModel) + .where(ProjectModel.id == project_id) + .options(joinedload(ProjectModel.backends)) + ) + project_model = res.unique().scalar_one_or_none() + if project_model is None: + return None + return await backends_services.get_project_backend_by_type( + project=project_model, + backend_type=backend_type, + ) + + +@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1) +def _check_instance_inner( + ports: dict[int, int], + *, + instance: InstanceModel, + check_instance_health: bool = False, +) -> InstanceCheck: + instance_health_response: Optional[InstanceHealthResponse] = None + shim_client = runner_client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT]) + method = shim_client.healthcheck + try: + healthcheck_response = method(unmask_exceptions=True) + if check_instance_health: + method = shim_client.get_instance_health + instance_health_response = method() + except requests.RequestException as 
exc: + template = "shim.%s(): request error: %s" + args = (method.__func__.__name__, exc) + logger.debug(template, *args) + return InstanceCheck(reachable=False, message=template % args) + except Exception as exc: + template = "shim.%s(): unexpected exception %s: %s" + args = (method.__func__.__name__, exc.__class__.__name__, exc) + logger.exception(template, *args) + return InstanceCheck(reachable=False, message=template % args) + + try: + remove_dangling_tasks_from_instance(shim_client, instance) + except Exception as exc: + logger.exception("%s: error removing dangling tasks: %s", fmt(instance), exc) + + # There should be no shim API calls after this function call since it can request shim restart. + _maybe_install_components(instance, shim_client) + return runner_client.healthcheck_response_to_instance_check( + healthcheck_response, + instance_health_response, + ) + + +def _maybe_install_components( + instance_model: InstanceModel, + shim_client: runner_client.ShimClient, +) -> None: + try: + components = shim_client.get_components() + except requests.RequestException as exc: + logger.warning( + "Instance %s: shim.get_components(): request error: %s", instance_model.name, exc + ) + return + if components is None: + logger.debug("Instance %s: no components info", instance_model.name) + return + + installed_shim_version: Optional[str] = None + installation_requested = False + + if (runner_info := components.runner) is not None: + installation_requested |= _maybe_install_runner(instance_model, shim_client, runner_info) + else: + logger.debug("Instance %s: no runner info", instance_model.name) + + if (shim_info := components.shim) is not None: + if shim_info.status == ComponentStatus.INSTALLED: + installed_shim_version = shim_info.version + installation_requested |= _maybe_install_shim(instance_model, shim_client, shim_info) + else: + logger.debug("Instance %s: no shim info", instance_model.name) + + # old shim without `dstack-shim` component and `/api/shutdown` 
support + # or the same version is already running + # or we just requested installation of at least one component + # or at least one component is already being installed + # or at least one shim task won't survive restart + running_shim_version = shim_client.get_version_string() + if ( + installed_shim_version is None + or installed_shim_version == running_shim_version + or installation_requested + or any(component.status == ComponentStatus.INSTALLING for component in components) + or not shim_client.is_safe_to_restart() + ): + return + + if shim_client.shutdown(force=False): + logger.debug( + "Instance %s: restarting shim %s -> %s", + instance_model.name, + running_shim_version, + installed_shim_version, + ) + else: + logger.debug("Instance %s: cannot restart shim", instance_model.name) + + +def _maybe_install_runner( + instance_model: InstanceModel, + shim_client: runner_client.ShimClient, + runner_info: ComponentInfo, +) -> bool: + # For developers: + # * To install the latest dev build for the current branch from the CI, + # set DSTACK_USE_LATEST_FROM_BRANCH=1. + # * To provide your own build, set DSTACK_RUNNER_VERSION_URL and DSTACK_RUNNER_DOWNLOAD_URL. 
+ expected_version = get_dstack_runner_version() + if expected_version is None: + return False + + installed_version = runner_info.version + logger.debug( + "Instance %s: runner status=%s installed_version=%s", + instance_model.name, + runner_info.status.value, + installed_version or "(no version)", + ) + if runner_info.status == ComponentStatus.INSTALLING: + logger.debug("Instance %s: runner is already being installed", instance_model.name) + return False + if installed_version and installed_version == expected_version: + logger.debug("Instance %s: expected runner version already installed", instance_model.name) + return False + + url = get_dstack_runner_download_url( + arch=_get_instance_cpu_arch(instance_model), + version=expected_version, + ) + logger.debug( + "Instance %s: installing runner %s -> %s from %s", + instance_model.name, + installed_version or "(no version)", + expected_version, + url, + ) + try: + shim_client.install_runner(url) + return True + except requests.RequestException as exc: + logger.warning("Instance %s: shim.install_runner(): %s", instance_model.name, exc) + return False + + +def _maybe_install_shim( + instance_model: InstanceModel, + shim_client: runner_client.ShimClient, + shim_info: ComponentInfo, +) -> bool: + # For developers: + # * To install the latest dev build for the current branch from the CI, + # set DSTACK_USE_LATEST_FROM_BRANCH=1. + # * To provide your own build, set DSTACK_SHIM_VERSION_URL and DSTACK_SHIM_DOWNLOAD_URL. 
+ expected_version = get_dstack_shim_version() + if expected_version is None: + return False + + installed_version = shim_info.version + logger.debug( + "Instance %s: shim status=%s installed_version=%s running_version=%s", + instance_model.name, + shim_info.status.value, + installed_version or "(no version)", + shim_client.get_version_string(), + ) + if shim_info.status == ComponentStatus.INSTALLING: + logger.debug("Instance %s: shim is already being installed", instance_model.name) + return False + if installed_version and installed_version == expected_version: + logger.debug("Instance %s: expected shim version already installed", instance_model.name) + return False + + url = get_dstack_shim_download_url( + arch=_get_instance_cpu_arch(instance_model), + version=expected_version, + ) + logger.debug( + "Instance %s: installing shim %s -> %s from %s", + instance_model.name, + installed_version or "(no version)", + expected_version, + url, + ) + try: + shim_client.install_shim(url) + return True + except requests.RequestException as exc: + logger.warning("Instance %s: shim.install_shim(): %s", instance_model.name, exc) + return False + + +def _get_instance_cpu_arch(instance_model: InstanceModel) -> Optional[gpuhunt.CPUArchitecture]: + job_provisioning_data = get_instance_provisioning_data(instance_model) + if job_provisioning_data is None: + return None + return job_provisioning_data.instance_type.resources.cpu_arch diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/cloud_provisioning.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/cloud_provisioning.py new file mode 100644 index 0000000000..4d2cbd8696 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/cloud_provisioning.py @@ -0,0 +1,421 @@ +import uuid +from dataclasses import dataclass +from typing import Optional + +from pydantic import ValidationError +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession 
+from sqlalchemy.orm import load_only +from sqlalchemy.orm.attributes import set_committed_value + +from dstack._internal.core.backends.base.compute import ( + ComputeWithCreateInstanceSupport, + ComputeWithPlacementGroupSupport, + generate_unique_placement_group_name, +) +from dstack._internal.core.backends.features import ( + BACKENDS_WITH_CREATE_INSTANCE_SUPPORT, + BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT, +) +from dstack._internal.core.errors import ( + BackendError, + PlacementGroupNotSupportedError, +) +from dstack._internal.core.models.instances import ( + InstanceOfferWithAvailability, + InstanceStatus, + InstanceTerminationReason, +) +from dstack._internal.core.models.placement import PlacementGroupConfiguration, PlacementStrategy +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server import settings as server_settings +from dstack._internal.server.background.pipeline_tasks.base import NOW_PLACEHOLDER +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + ProcessResult, + set_status_update, +) +from dstack._internal.server.db import get_session_ctx +from dstack._internal.server.models import FleetModel, InstanceModel, PlacementGroupModel +from dstack._internal.server.services.fleets import get_create_instance_offers, is_cloud_cluster +from dstack._internal.server.services.instances import ( + get_instance_configuration, + get_instance_profile, + get_instance_provisioning_data, + get_instance_requirements, +) +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.offers import get_instance_offer_with_restricted_az +from dstack._internal.server.services.placement import ( + get_fleet_placement_group_models, + placement_group_model_to_placement_group, + placement_group_model_to_placement_group_optional, +) +from dstack._internal.utils.common import get_or_error, run_async +from dstack._internal.utils.logging import get_logger + +logger = 
get_logger(__name__) + + +@dataclass +class _ClusterMasterContext: + current_master_instance_model: InstanceModel + is_current_instance_master: bool + master_job_provisioning_data: Optional[JobProvisioningData] + + +async def create_cloud_instance(instance_model: InstanceModel) -> ProcessResult: + result = ProcessResult() + + try: + instance_configuration = get_instance_configuration(instance_model) + profile = get_instance_profile(instance_model) + requirements = get_instance_requirements(instance_model) + except ValidationError as exc: + logger.exception( + "%s: error parsing profile, requirements or instance configuration", + fmt(instance_model), + ) + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message=( + f"Error to parse profile, requirements or instance_configuration: {exc}" + ), + ) + return result + + cluster_context = None + placement_group_models: list[PlacementGroupModel] = [] + placement_group_model = None + master_job_provisioning_data = None + if instance_model.fleet is not None and is_cloud_cluster(instance_model.fleet): + cluster_context = await _get_cluster_master_context(instance_model) + if cluster_context is None: + # Waiting for the master + return result + placement_group_models, placement_group_model = await _get_cluster_placement_context( + instance_model=instance_model, + cluster_context=cluster_context, + ) + master_job_provisioning_data = cluster_context.master_job_provisioning_data + + offers = await get_create_instance_offers( + project=instance_model.project, + profile=profile, + requirements=requirements, + fleet_model=instance_model.fleet, + placement_group=placement_group_model_to_placement_group_optional(placement_group_model), + blocks="auto" if instance_model.total_blocks is None else instance_model.total_blocks, + exclude_not_available=True, + 
master_job_provisioning_data=master_job_provisioning_data, + infer_master_job_provisioning_data_from_fleet_instances=False, + ) + + # Limit number of offers tried to prevent long-running processing in case all offers fail. + for backend, instance_offer in offers[: server_settings.MAX_OFFERS_TRIED]: + if instance_offer.backend not in BACKENDS_WITH_CREATE_INSTANCE_SUPPORT: + continue + compute = backend.compute() + assert isinstance(compute, ComputeWithCreateInstanceSupport) + if master_job_provisioning_data is not None: + # `get_create_instance_offers()` already restricts backend and region from the master. + # Availability zone still has to be narrowed per offer. + instance_offer = get_instance_offer_with_restricted_az( + instance_offer=instance_offer, + master_job_provisioning_data=master_job_provisioning_data, + ) + if ( + cluster_context is not None + and cluster_context.is_current_instance_master + and instance_offer.backend in BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT + and isinstance(compute, ComputeWithPlacementGroupSupport) + and ( + compute.are_placement_groups_compatible_with_reservations(instance_offer.backend) + or instance_configuration.reservation is None + ) + ): + ( + placement_group_model, + created_placement_group_model, + ) = await _find_or_create_suitable_placement_group_model( + instance_model=instance_model, + placement_group_models=placement_group_models, + instance_offer=instance_offer, + compute=compute, + ) + if placement_group_model is None: + continue + if created_placement_group_model: + placement_group_models.append(placement_group_model) + result.new_placement_group_models.append(placement_group_model) + + logger.debug( + "Trying %s in %s/%s for $%0.4f per hour", + instance_offer.instance.name, + instance_offer.backend.value, + instance_offer.region, + instance_offer.price, + ) + try: + job_provisioning_data = await run_async( + compute.create_instance, + instance_offer, + instance_configuration, + 
placement_group_model_to_placement_group_optional(placement_group_model), + ) + except BackendError as exc: + logger.warning( + "%s launch in %s/%s failed: %s", + instance_offer.instance.name, + instance_offer.backend.value, + instance_offer.region, + repr(exc), + extra={"instance_name": instance_model.name}, + ) + continue + except Exception: + logger.exception( + "Got exception when launching %s in %s/%s", + instance_offer.instance.name, + instance_offer.backend.value, + instance_offer.region, + ) + continue + + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.PROVISIONING, + ) + result.instance_update_map["backend"] = backend.TYPE + result.instance_update_map["region"] = instance_offer.region + result.instance_update_map["price"] = instance_offer.price + result.instance_update_map["instance_configuration"] = instance_configuration.json() + result.instance_update_map["job_provisioning_data"] = job_provisioning_data.json() + result.instance_update_map["offer"] = instance_offer.json() + result.instance_update_map["total_blocks"] = instance_offer.total_blocks + result.instance_update_map["started_at"] = NOW_PLACEHOLDER + + if ( + instance_model.fleet_id is not None + and cluster_context is not None + and cluster_context.is_current_instance_master + ): + # Clean up placement groups that did not end up being used. 
+ result.schedule_pg_deletion_fleet_id = instance_model.fleet_id + if placement_group_model is not None: + result.schedule_pg_deletion_except_id = placement_group_model.id + return result + + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + termination_reason=InstanceTerminationReason.NO_OFFERS, + termination_reason_message="All offers failed" if offers else "No offers found", + ) + return result + + +async def _get_cluster_master_context( + instance_model: InstanceModel, +) -> Optional[_ClusterMasterContext]: + assert instance_model.fleet is not None and is_cloud_cluster(instance_model.fleet) + assert instance_model.fleet_id is not None + async with get_session_ctx() as session: + current_master_instance_model = await _load_current_master_instance( + session=session, + fleet_id=instance_model.fleet_id, + ) + if current_master_instance_model is None: + logger.debug( + "%s: waiting for fleet pipeline to elect current cluster master", + fmt(instance_model), + ) + return None + + is_current_instance_master = current_master_instance_model.id == instance_model.id + master_job_provisioning_data = None + if not is_current_instance_master: + if ( + current_master_instance_model.deleted + or current_master_instance_model.status == InstanceStatus.TERMINATED + ): + logger.debug( + "%s: waiting for fleet pipeline to replace current master %s", + fmt(instance_model), + current_master_instance_model.id, + ) + return None + master_job_provisioning_data = get_instance_provisioning_data( + current_master_instance_model + ) + if master_job_provisioning_data is None: + logger.debug( + "%s: waiting for current master %s to determine cluster placement", + fmt(instance_model), + current_master_instance_model.id, + ) + return None + + return _ClusterMasterContext( + current_master_instance_model=current_master_instance_model, + is_current_instance_master=is_current_instance_master, + 
master_job_provisioning_data=master_job_provisioning_data, + ) + + +async def _get_cluster_placement_context( + instance_model: InstanceModel, + cluster_context: _ClusterMasterContext, +) -> tuple[list[PlacementGroupModel], Optional[PlacementGroupModel]]: + assert instance_model.fleet is not None and is_cloud_cluster(instance_model.fleet) + assert instance_model.fleet_id is not None + async with get_session_ctx() as session: + placement_group_models = await get_fleet_placement_group_models( + session=session, + fleet_id=instance_model.fleet_id, + ) + placement_group_model = None + if not cluster_context.is_current_instance_master: + # Non-master instances only reuse the placement group chosen by the + # current master. They never create a new placement group themselves. + placement_group_model = _get_current_master_placement_group_model( + placement_group_models=placement_group_models, + fleet_id=instance_model.fleet_id, + ) + if placement_group_model is not None: + _populate_current_master_placement_group_relations( + placement_group_model=placement_group_model, + instance_model=instance_model, + ) + return placement_group_models, placement_group_model + + +async def _load_current_master_instance( + session: AsyncSession, + fleet_id: uuid.UUID, +) -> Optional[InstanceModel]: + res = await session.execute( + select(FleetModel.current_master_instance_id).where(FleetModel.id == fleet_id) + ) + current_master_instance_id = res.scalar_one_or_none() + if current_master_instance_id is None: + return None + res = await session.execute( + select(InstanceModel) + .where( + InstanceModel.id == current_master_instance_id, + ) + .options( + load_only( + InstanceModel.id, + InstanceModel.deleted, + InstanceModel.status, + InstanceModel.job_provisioning_data, + ) + ) + ) + return res.scalar_one_or_none() + + +def _get_current_master_placement_group_model( + placement_group_models: list[PlacementGroupModel], + fleet_id: uuid.UUID, +) -> Optional[PlacementGroupModel]: + if not 
placement_group_models: + return None + if len(placement_group_models) > 1: + logger.error( + "Expected 0 or 1 placement groups associated with fleet master %s, found %s." + " Using the first placement group for this provisioning attempt.", + fleet_id, + len(placement_group_models), + ) + return placement_group_models[0] + + +def _populate_current_master_placement_group_relations( + placement_group_model: PlacementGroupModel, + instance_model: InstanceModel, +) -> None: + # Placement groups are loaded in a separate session from the instance worker. + # Reattach the already-known project/fleet objects so later detached access + # can still build a PlacementGroup value object without lazy loading. + set_committed_value(placement_group_model, "project", instance_model.project) + if instance_model.fleet is not None: + set_committed_value(placement_group_model, "fleet", instance_model.fleet) + + +async def _find_or_create_suitable_placement_group_model( + instance_model: InstanceModel, + placement_group_models: list[PlacementGroupModel], + instance_offer: InstanceOfferWithAvailability, + compute: ComputeWithPlacementGroupSupport, +) -> tuple[Optional[PlacementGroupModel], bool]: + for placement_group_model in placement_group_models: + if compute.is_suitable_placement_group( + placement_group_model_to_placement_group(placement_group_model), + instance_offer, + ): + return placement_group_model, False + + assert instance_model.fleet is not None + placement_group_id = uuid.uuid4() + placement_group_name = generate_unique_placement_group_name( + project_name=instance_model.project.name, + fleet_name=instance_model.fleet.name, + ) + placement_group_model = PlacementGroupModel( + id=placement_group_id, + name=placement_group_name, + project=instance_model.project, + fleet=get_or_error(instance_model.fleet), + configuration=PlacementGroupConfiguration( + backend=instance_offer.backend, + region=instance_offer.region, + placement_strategy=PlacementStrategy.CLUSTER, + ).json(), 
+ ) + placement_group = placement_group_model_to_placement_group(placement_group_model) + logger.debug( + "Creating placement group %s in %s/%s", + placement_group.name, + placement_group.configuration.backend.value, + placement_group.configuration.region, + ) + try: + provisioning_data = await run_async( + compute.create_placement_group, + placement_group, + instance_offer, + ) + except PlacementGroupNotSupportedError: + logger.debug( + "Skipping offer %s because placement group not supported", + instance_offer.instance.name, + ) + return None, False + except BackendError as exc: + logger.warning( + "Failed to create placement group %s in %s/%s: %r", + placement_group.name, + placement_group.configuration.backend.value, + placement_group.configuration.region, + exc, + ) + return None, False + except Exception: + logger.exception( + "Got exception when creating placement group %s in %s/%s", + placement_group.name, + placement_group.configuration.backend.value, + placement_group.configuration.region, + ) + return None, False + + placement_group.provisioning_data = provisioning_data + placement_group_model.provisioning_data = provisioning_data.json() + return placement_group_model, True diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/common.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/common.py new file mode 100644 index 0000000000..34e80311fd --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/common.py @@ -0,0 +1,177 @@ +import datetime +import uuid +from dataclasses import dataclass, field +from datetime import timedelta +from typing import Optional, TypedDict, Union + +from paramiko.pkey import PKey +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import ( + 
InstanceStatus, + InstanceTerminationReason, + SSHKey, +) +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server.background.pipeline_tasks.base import ( + ItemUpdateMap, + UpdateMapDateTime, +) +from dstack._internal.server.background.scheduled_tasks.common import get_provisioning_timeout +from dstack._internal.server.models import FleetModel, InstanceModel, PlacementGroupModel +from dstack._internal.server.services.fleets import get_fleet_spec +from dstack._internal.utils.common import UNSET, Unset, get_current_datetime +from dstack._internal.utils.ssh import pkey_from_str + +TERMINATION_DEADLINE_OFFSET = timedelta(minutes=20) +TERMINATION_RETRY_TIMEOUT = timedelta(seconds=30) +TERMINATION_RETRY_MAX_DURATION = timedelta(minutes=15) +PROVISIONING_TIMEOUT_SECONDS = 10 * 60 # 10 minutes in seconds + + +class InstanceUpdateMap(ItemUpdateMap, total=False): + status: InstanceStatus + unreachable: bool + started_at: UpdateMapDateTime + finished_at: UpdateMapDateTime + instance_configuration: str + termination_deadline: Optional[datetime.datetime] + termination_reason: Optional[InstanceTerminationReason] + termination_reason_message: Optional[str] + health: HealthStatus + first_termination_retry_at: UpdateMapDateTime + last_termination_retry_at: UpdateMapDateTime + backend: BackendType + backend_data: Optional[str] + offer: str + region: str + price: float + job_provisioning_data: str + total_blocks: int + busy_blocks: int + deleted: bool + deleted_at: UpdateMapDateTime + + +class HealthCheckCreate(TypedDict): + instance_id: uuid.UUID + collected_at: datetime.datetime + status: HealthStatus + response: str + + +@dataclass +class ProcessResult: + instance_update_map: InstanceUpdateMap = field(default_factory=InstanceUpdateMap) + health_check_create: Optional[HealthCheckCreate] = None + new_placement_group_models: list[PlacementGroupModel] = field(default_factory=list) + schedule_pg_deletion_fleet_id: Optional[uuid.UUID] = None + 
schedule_pg_deletion_except_id: Optional[uuid.UUID] = None + + +async def can_terminate_fleet_instances_on_idle_duration( + session: AsyncSession, + fleet_model: FleetModel, +) -> bool: + fleet_spec = get_fleet_spec(fleet_model) + if fleet_spec.configuration.nodes is None or fleet_spec.autocreated: + return True + res = await session.execute( + select(func.count(1)).where( + InstanceModel.fleet_id == fleet_model.id, + InstanceModel.deleted == False, + InstanceModel.status.not_in(InstanceStatus.finished_statuses()), + ) + ) + return res.scalar_one() > fleet_spec.configuration.nodes.min + + +def get_instance_idle_duration(instance_model: InstanceModel) -> datetime.timedelta: + last_time = instance_model.created_at + if instance_model.last_job_processed_at is not None: + last_time = instance_model.last_job_processed_at + return get_current_datetime() - last_time + + +def get_provisioning_deadline( + instance_model: InstanceModel, + job_provisioning_data: JobProvisioningData, +) -> datetime.datetime: + assert instance_model.started_at is not None + timeout_interval = get_provisioning_timeout( + backend_type=job_provisioning_data.get_base_backend(), + instance_type_name=job_provisioning_data.instance_type.name, + ) + return instance_model.started_at + timeout_interval + + +def next_termination_retry_at(last_termination_retry_at: datetime.datetime) -> datetime.datetime: + return last_termination_retry_at + TERMINATION_RETRY_TIMEOUT + + +def get_termination_deadline(first_termination_retry_at: datetime.datetime) -> datetime.datetime: + return first_termination_retry_at + TERMINATION_RETRY_MAX_DURATION + + +def ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]: + return [pkey_from_str(ssh_key.private) for ssh_key in ssh_keys if ssh_key.private is not None] + + +def set_status_update( + update_map: InstanceUpdateMap, + instance_model: InstanceModel, + new_status: InstanceStatus, + termination_reason: Union[Optional[InstanceTerminationReason], Unset] = UNSET, + 
termination_reason_message: Union[Optional[str], Unset] = UNSET, +) -> bool: + old_status = instance_model.status + changed = False + if old_status == new_status: + if not isinstance(termination_reason, Unset): + update_map["termination_reason"] = termination_reason + changed = True + if not isinstance(termination_reason_message, Unset): + update_map["termination_reason_message"] = termination_reason_message + changed = True + return changed + + effective_termination_reason = instance_model.termination_reason + if not isinstance(termination_reason, Unset): + effective_termination_reason = termination_reason + update_map["termination_reason"] = effective_termination_reason + changed = True + + effective_termination_reason_message = instance_model.termination_reason_message + if not isinstance(termination_reason_message, Unset): + effective_termination_reason_message = termination_reason_message + update_map["termination_reason_message"] = effective_termination_reason_message + changed = True + + update_map["status"] = new_status + changed = True + return changed + + +def set_health_update( + update_map: InstanceUpdateMap, + instance_model: InstanceModel, + health: HealthStatus, +) -> bool: + if instance_model.health == health: + return False + update_map["health"] = health + return True + + +def set_unreachable_update( + update_map: InstanceUpdateMap, + instance_model: InstanceModel, + unreachable: bool, +) -> bool: + if not instance_model.status.is_available() or instance_model.unreachable == unreachable: + return False + update_map["unreachable"] = unreachable + return True diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/ssh_deploy.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/ssh_deploy.py new file mode 100644 index 0000000000..b4e3e1122a --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/ssh_deploy.py @@ -0,0 +1,302 @@ +import asyncio +from datetime import timedelta +from typing 
import Any, Optional + +from paramiko.pkey import PKey +from paramiko.ssh_exception import PasswordRequiredException +from pydantic import ValidationError + +from dstack._internal import settings +from dstack._internal.core.backends.base.compute import ( + GoArchType, + get_dstack_runner_binary_path, + get_dstack_shim_binary_path, + get_dstack_working_dir, + get_shim_env, + get_shim_pre_start_commands, +) +from dstack._internal.core.errors import SSHProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOfferWithAvailability, + InstanceRuntime, + InstanceStatus, + InstanceTerminationReason, + RemoteConnectionInfo, +) +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server.background.pipeline_tasks.base import NOW_PLACEHOLDER +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + PROVISIONING_TIMEOUT_SECONDS, + ProcessResult, + set_status_update, + ssh_keys_to_pkeys, +) +from dstack._internal.server.models import InstanceModel +from dstack._internal.server.schemas.instances import InstanceCheck +from dstack._internal.server.schemas.runner import HealthcheckResponse +from dstack._internal.server.services.instances import get_instance_remote_connection_info +from dstack._internal.server.services.logging import fmt +from dstack._internal.server.services.offers import is_divisible_into_blocks +from dstack._internal.server.services.runner import client as runner_client +from dstack._internal.server.services.ssh_fleets.provisioning import ( + detect_cpu_arch, + get_host_info, + get_paramiko_connection, + get_shim_healthcheck, + host_info_to_instance_type, + remove_dstack_runner_if_exists, + remove_host_info_if_exists, + run_pre_start_commands, + run_shim_as_systemd_service, + upload_envs, +) +from dstack._internal.utils.common import get_current_datetime, run_async +from 
dstack._internal.utils.logging import get_logger +from dstack._internal.utils.network import get_ip_from_network, is_ip_among_addresses + +logger = get_logger(__name__) + + +async def add_ssh_instance(instance_model: InstanceModel) -> ProcessResult: + result = ProcessResult() + logger.info("Adding ssh instance %s...", instance_model.name) + + retry_duration_deadline = instance_model.created_at + timedelta( + seconds=PROVISIONING_TIMEOUT_SECONDS + ) + if retry_duration_deadline < get_current_datetime(): + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + termination_reason=InstanceTerminationReason.PROVISIONING_TIMEOUT, + termination_reason_message=( + f"Failed to add SSH instance in {PROVISIONING_TIMEOUT_SECONDS}s" + ), + ) + return result + + remote_details = get_instance_remote_connection_info(instance_model) + assert remote_details is not None + + try: + pkeys = ssh_keys_to_pkeys(remote_details.ssh_keys) + ssh_proxy_pkeys = None + if remote_details.ssh_proxy_keys is not None: + ssh_proxy_pkeys = ssh_keys_to_pkeys(remote_details.ssh_proxy_keys) + except (ValueError, PasswordRequiredException): + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message="Unsupported private SSH key type", + ) + return result + + authorized_keys = [pkey.public.strip() for pkey in remote_details.ssh_keys] + authorized_keys.append(instance_model.project.ssh_public_key.strip()) + + try: + future = run_async( + _deploy_instance, + remote_details, + pkeys, + ssh_proxy_pkeys, + authorized_keys, + ) + health, host_info, arch = await asyncio.wait_for(future, timeout=20 * 60) + except (asyncio.TimeoutError, TimeoutError) as exc: + logger.warning( + "%s: deploy timeout when adding SSH instance: %s", + fmt(instance_model), + repr(exc), + ) + return 
result + except SSHProvisioningError as exc: + logger.warning( + "%s: provisioning error when adding SSH instance: %s", + fmt(instance_model), + repr(exc), + ) + return result + except Exception: + logger.exception("%s: unexpected error when adding SSH instance", fmt(instance_model)) + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message="Unexpected error when adding SSH instance", + ) + return result + + instance_type = host_info_to_instance_type(host_info, arch) + try: + instance_network, internal_ip = _resolve_ssh_instance_network(instance_model, host_info) + except _SSHInstanceNetworkResolutionError as exc: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message=str(exc), + ) + return result + + divisible, blocks = is_divisible_into_blocks( + cpu_count=instance_type.resources.cpus, + gpu_count=len(instance_type.resources.gpus), + blocks="auto" if instance_model.total_blocks is None else instance_model.total_blocks, + ) + if not divisible: + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + termination_reason=InstanceTerminationReason.ERROR, + termination_reason_message="Cannot split into blocks", + ) + return result + + region = instance_model.region + assert region is not None + job_provisioning_data = JobProvisioningData( + backend=BackendType.REMOTE, + instance_type=instance_type, + instance_id="instance_id", + hostname=remote_details.host, + region=region, + price=0, + internal_ip=internal_ip, + instance_network=instance_network, + username=remote_details.ssh_user, + ssh_port=remote_details.port, + dockerized=True, + backend_data=None, + 
ssh_proxy=remote_details.ssh_proxy, + ) + instance_offer = InstanceOfferWithAvailability( + backend=BackendType.REMOTE, + instance=instance_type, + region=region, + price=0, + availability=InstanceAvailability.AVAILABLE, + instance_runtime=InstanceRuntime.SHIM, + ) + + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.IDLE if health else InstanceStatus.PROVISIONING, + ) + result.instance_update_map["backend"] = BackendType.REMOTE + result.instance_update_map["price"] = 0 + result.instance_update_map["offer"] = instance_offer.json() + result.instance_update_map["job_provisioning_data"] = job_provisioning_data.json() + result.instance_update_map["started_at"] = NOW_PLACEHOLDER + result.instance_update_map["total_blocks"] = blocks + return result + + +class _SSHInstanceNetworkResolutionError(Exception): + pass + + +def _resolve_ssh_instance_network( + instance_model: InstanceModel, + host_info: dict[str, Any], +) -> tuple[Optional[str], Optional[str]]: + instance_network = None + internal_ip = None + try: + default_job_provisioning_data = JobProvisioningData.__response__.parse_raw( + instance_model.job_provisioning_data + ) + instance_network = default_job_provisioning_data.instance_network + internal_ip = default_job_provisioning_data.internal_ip + except ValidationError: + pass + + host_network_addresses = host_info.get("addresses", []) + if internal_ip is None: + internal_ip = get_ip_from_network( + network=instance_network, + addresses=host_network_addresses, + ) + if instance_network is not None and internal_ip is None: + raise _SSHInstanceNetworkResolutionError( + "Failed to locate internal IP address on the given network" + ) + if internal_ip is not None and not is_ip_among_addresses( + ip_address=internal_ip, + addresses=host_network_addresses, + ): + raise _SSHInstanceNetworkResolutionError( + "Specified internal IP not found among instance interfaces" + ) + return instance_network, 
internal_ip + + +def _deploy_instance( + remote_details: RemoteConnectionInfo, + pkeys: list[PKey], + ssh_proxy_pkeys: Optional[list[PKey]], + authorized_keys: list[str], +) -> tuple[InstanceCheck, dict[str, Any], GoArchType]: + with get_paramiko_connection( + remote_details.ssh_user, + remote_details.host, + remote_details.port, + pkeys, + remote_details.ssh_proxy, + ssh_proxy_pkeys, + ) as client: + logger.debug("Connected to %s %s", remote_details.ssh_user, remote_details.host) + + arch = detect_cpu_arch(client) + logger.debug("%s: CPU arch is %s", remote_details.host, arch) + + # Execute pre start commands + shim_pre_start_commands = get_shim_pre_start_commands(arch=arch) + run_pre_start_commands(client, shim_pre_start_commands, authorized_keys) + logger.debug("The script for installing dstack has been executed") + + # Upload envs + shim_envs = get_shim_env(arch=arch) + try: + fleet_configuration_envs = remote_details.env.as_dict() + except ValueError as exc: + raise SSHProvisioningError(f"Invalid Env: {exc}") from exc + shim_envs.update(fleet_configuration_envs) + dstack_working_dir = get_dstack_working_dir() + dstack_shim_binary_path = get_dstack_shim_binary_path() + dstack_runner_binary_path = get_dstack_runner_binary_path() + upload_envs(client, dstack_working_dir, shim_envs) + logger.debug("The dstack-shim environment variables have been installed") + + # Ensure we have fresh versions of host info.json and dstack-runner + remove_host_info_if_exists(client, dstack_working_dir) + remove_dstack_runner_if_exists(client, dstack_runner_binary_path) + + # Run dstack-shim as a systemd service + run_shim_as_systemd_service( + client=client, + binary_path=dstack_shim_binary_path, + working_dir=dstack_working_dir, + dev=settings.DSTACK_VERSION is None, + ) + + # Get host info + host_info = get_host_info(client, dstack_working_dir) + logger.debug("Received a host_info %s", host_info) + + healthcheck_out = get_shim_healthcheck(client) + try: + healthcheck = 
HealthcheckResponse.__response__.parse_raw(healthcheck_out) + except ValueError as exc: + raise SSHProvisioningError(f"Cannot parse HealthcheckResponse: {exc}") from exc + instance_check = runner_client.healthcheck_response_to_instance_check(healthcheck) + return instance_check, host_info, arch diff --git a/src/dstack/_internal/server/background/pipeline_tasks/instances/termination.py b/src/dstack/_internal/server/background/pipeline_tasks/instances/termination.py new file mode 100644 index 0000000000..eb1f3c8a39 --- /dev/null +++ b/src/dstack/_internal/server/background/pipeline_tasks/instances/termination.py @@ -0,0 +1,88 @@ +from dstack._internal.core.errors import BackendError, NotYetTerminated +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.server.background.pipeline_tasks.base import NOW_PLACEHOLDER +from dstack._internal.server.background.pipeline_tasks.instances.common import ( + ProcessResult, + get_termination_deadline, + next_termination_retry_at, + set_status_update, +) +from dstack._internal.server.models import InstanceModel +from dstack._internal.server.services import backends as backends_services +from dstack._internal.server.services.instances import get_instance_provisioning_data +from dstack._internal.utils.common import get_current_datetime, run_async +from dstack._internal.utils.logging import get_logger + +logger = get_logger(__name__) + + +async def terminate_instance(instance_model: InstanceModel) -> ProcessResult: + result = ProcessResult() + now = get_current_datetime() + if ( + instance_model.last_termination_retry_at is not None + and next_termination_retry_at(instance_model.last_termination_retry_at) > now + ): + return result + + job_provisioning_data = get_instance_provisioning_data(instance_model) + if job_provisioning_data is not None and job_provisioning_data.backend != BackendType.REMOTE: + backend = await 
backends_services.get_project_backend_by_type( + project=instance_model.project, + backend_type=job_provisioning_data.backend, + ) + if backend is None: + logger.error( + "Failed to terminate instance %s. Backend %s not available.", + instance_model.name, + job_provisioning_data.backend, + ) + else: + logger.debug("Terminating runner instance %s", job_provisioning_data.hostname) + try: + await run_async( + backend.compute().terminate_instance, + job_provisioning_data.instance_id, + job_provisioning_data.region, + job_provisioning_data.backend_data, + ) + except Exception as exc: + first_retry_at = instance_model.first_termination_retry_at + if first_retry_at is None: + first_retry_at = now + result.instance_update_map["first_termination_retry_at"] = NOW_PLACEHOLDER + result.instance_update_map["last_termination_retry_at"] = NOW_PLACEHOLDER + if next_termination_retry_at(now) < get_termination_deadline(first_retry_at): + if isinstance(exc, NotYetTerminated): + logger.debug( + "Instance %s termination in progress: %s", + instance_model.name, + exc, + ) + else: + logger.warning( + "Failed to terminate instance %s. Will retry. Error: %r", + instance_model.name, + exc, + exc_info=not isinstance(exc, BackendError), + ) + return result + logger.error( + "Failed all attempts to terminate instance %s." + " Please terminate the instance manually to avoid unexpected charges." 
+ " Error: %r", + instance_model.name, + exc, + exc_info=not isinstance(exc, BackendError), + ) + + result.instance_update_map["deleted"] = True + result.instance_update_map["deleted_at"] = NOW_PLACEHOLDER + result.instance_update_map["finished_at"] = NOW_PLACEHOLDER + set_status_update( + update_map=result.instance_update_map, + instance_model=instance_model, + new_status=InstanceStatus.TERMINATED, + ) + return result diff --git a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py index 703cfe1548..552ae00dc8 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/placement_groups.py @@ -18,6 +18,8 @@ PipelineItem, UpdateMapDateTime, Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, resolve_now_placeholders, set_processed_update_map_fields, set_unlock_update_map_fields, @@ -189,12 +191,7 @@ async def process(self, item: PipelineItem): ) placement_group_model = res.unique().scalar_one_or_none() if placement_group_model is None: - logger.warning( - "Failed to process %s item %s: lock_token mismatch." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_mismatch(logger, item) return result = await _delete_placement_group(placement_group_model) @@ -217,12 +214,7 @@ async def process(self, item: PipelineItem): ) updated_ids = list(res.scalars().all()) if len(updated_ids) == 0: - logger.warning( - "Failed to update %s item %s after processing: lock_token changed." 
- " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_changed_after_processing(logger, item) class _PlacementGroupUpdateMap(ItemUpdateMap, total=False): diff --git a/src/dstack/_internal/server/background/pipeline_tasks/volumes.py b/src/dstack/_internal/server/background/pipeline_tasks/volumes.py index c7a8f5761a..81d94c361b 100644 --- a/src/dstack/_internal/server/background/pipeline_tasks/volumes.py +++ b/src/dstack/_internal/server/background/pipeline_tasks/volumes.py @@ -19,6 +19,8 @@ PipelineItem, UpdateMapDateTime, Worker, + log_lock_token_changed_after_processing, + log_lock_token_mismatch, resolve_now_placeholders, set_processed_update_map_fields, set_unlock_update_map_fields, @@ -204,8 +206,6 @@ async def process(self, item: VolumePipelineItem): await _process_to_be_deleted_item(item) elif item.status == VolumeStatus.SUBMITTED: await _process_submitted_item(item) - elif item.status == VolumeStatus.ACTIVE: - pass async def _process_submitted_item(item: VolumePipelineItem): @@ -227,12 +227,7 @@ async def _process_submitted_item(item: VolumePipelineItem): ) volume_model = res.unique().scalar_one_or_none() if volume_model is None: - logger.warning( - "Failed to process %s item %s: lock_token mismatch." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_mismatch(logger, item) return result = await _process_submitted_volume(volume_model) @@ -253,12 +248,7 @@ async def _process_submitted_item(item: VolumePipelineItem): ) updated_ids = list(res.scalars().all()) if len(updated_ids) == 0: - logger.warning( - "Failed to update %s item %s after processing: lock_token changed." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_changed_after_processing(logger, item) # TODO: Clean up volume. 
return emit_volume_status_change_event( @@ -369,12 +359,7 @@ async def _process_to_be_deleted_item(item: VolumePipelineItem): ) volume_model = res.unique().scalar_one_or_none() if volume_model is None: - logger.warning( - "Failed to process %s item %s: lock_token mismatch." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_mismatch(logger, item) return result = await _process_to_be_deleted_volume(volume_model) @@ -396,12 +381,7 @@ async def _process_to_be_deleted_item(item: VolumePipelineItem): ) updated_ids = list(res.scalars().all()) if len(updated_ids) == 0: - logger.warning( - "Failed to update %s item %s after processing: lock_token changed." - " The item is expected to be processed and updated on another fetch iteration.", - item.__tablename__, - item.id, - ) + log_lock_token_changed_after_processing(logger, item) return events.emit( session, diff --git a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py index 9c7cd6ac1a..2994fca37c 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/__init__.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/__init__.py @@ -14,8 +14,10 @@ from dstack._internal.server.background.scheduled_tasks.idle_volumes import ( process_idle_volumes, ) +from dstack._internal.server.background.scheduled_tasks.instance_healthchecks import ( + delete_instance_healthchecks, +) from dstack._internal.server.background.scheduled_tasks.instances import ( - delete_instance_health_checks, process_instances, ) from dstack._internal.server.background.scheduled_tasks.metrics import ( @@ -93,16 +95,16 @@ def start_scheduled_tasks() -> AsyncIOScheduler: _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1) _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1) _scheduler.add_job(delete_events, 
IntervalTrigger(minutes=7), max_instances=1) + _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15)) + _scheduler.add_job( + process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1 + ) + _scheduler.add_job(delete_instance_healthchecks, IntervalTrigger(minutes=5), max_instances=1) if settings.ENABLE_PROMETHEUS_METRICS: _scheduler.add_job( collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1 ) _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1) - _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15)) - _scheduler.add_job( - process_idle_volumes, IntervalTrigger(seconds=60, jitter=10), max_instances=1 - ) - _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1) if not FeatureFlags.PIPELINE_PROCESSING_ENABLED: _scheduler.add_job( process_fleets, @@ -144,13 +146,13 @@ def start_scheduled_tasks() -> AsyncIOScheduler: kwargs={"batch_size": 5}, max_instances=2 if replica == 0 else 1, ) - _scheduler.add_job( - process_instances, - IntervalTrigger(seconds=4, jitter=2), - kwargs={"batch_size": 5}, - max_instances=2 if replica == 0 else 1, - ) if not FeatureFlags.PIPELINE_PROCESSING_ENABLED: + _scheduler.add_job( + process_instances, + IntervalTrigger(seconds=4, jitter=2), + kwargs={"batch_size": 5}, + max_instances=2 if replica == 0 else 1, + ) _scheduler.add_job( process_compute_groups, IntervalTrigger(seconds=15, jitter=2), diff --git a/src/dstack/_internal/server/background/scheduled_tasks/instance_healthchecks.py b/src/dstack/_internal/server/background/scheduled_tasks/instance_healthchecks.py new file mode 100644 index 0000000000..41e83c71aa --- /dev/null +++ b/src/dstack/_internal/server/background/scheduled_tasks/instance_healthchecks.py @@ -0,0 +1,20 @@ +from datetime import timedelta + +from sqlalchemy import delete + +from dstack._internal.server import settings +from dstack._internal.server.db import 
get_session_ctx +from dstack._internal.server.models import InstanceHealthCheckModel +from dstack._internal.server.utils import sentry_utils +from dstack._internal.utils.common import get_current_datetime + + +@sentry_utils.instrument_scheduled_task +async def delete_instance_healthchecks(): + now = get_current_datetime() + cutoff = now - timedelta(seconds=settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS) + async with get_session_ctx() as session: + await session.execute( + delete(InstanceHealthCheckModel).where(InstanceHealthCheckModel.collected_at < cutoff) + ) + await session.commit() diff --git a/src/dstack/_internal/server/background/scheduled_tasks/instances.py b/src/dstack/_internal/server/background/scheduled_tasks/instances.py index e5ecba5278..1857e0ad09 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/instances.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/instances.py @@ -9,7 +9,7 @@ from paramiko.pkey import PKey from paramiko.ssh_exception import PasswordRequiredException from pydantic import ValidationError -from sqlalchemy import and_, delete, func, not_, select +from sqlalchemy import and_, func, not_, select from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload @@ -33,12 +33,11 @@ BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT, ) from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT - -# FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute from dstack._internal.core.errors import ( BackendError, NotYetTerminated, ProvisioningError, + SSHProvisioningError, ) from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.fleets import InstanceGroupPlacement @@ -108,8 +107,7 @@ ) from dstack._internal.server.services.runner import client as runner_client from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel -from dstack._internal.server.utils import sentry_utils -from 
dstack._internal.server.utils.provisioning import ( +from dstack._internal.server.services.ssh_fleets.provisioning import ( detect_cpu_arch, get_host_info, get_paramiko_connection, @@ -121,6 +119,7 @@ run_shim_as_systemd_service, upload_envs, ) +from dstack._internal.server.utils import sentry_utils from dstack._internal.utils.common import ( get_current_datetime, get_or_error, @@ -152,17 +151,6 @@ async def process_instances(batch_size: int = 1): await asyncio.gather(*tasks) -@sentry_utils.instrument_scheduled_task -async def delete_instance_health_checks(): - now = get_current_datetime() - cutoff = now - timedelta(seconds=server_settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS) - async with get_session_ctx() as session: - await session.execute( - delete(InstanceHealthCheckModel).where(InstanceHealthCheckModel.collected_at < cutoff) - ) - await session.commit() - - @sentry_utils.instrument_scheduled_task async def _process_next_instance(): lock, lockset = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__) @@ -211,63 +199,81 @@ async def _process_next_instance(): async def _process_instance(session: AsyncSession, instance: InstanceModel): logger.debug("%s: processing instance, status: %s", fmt(instance), instance.status.upper()) - # Refetch to load related attributes. - # Load related attributes only for statuses that always need them. 
- if instance.status in ( - InstanceStatus.PENDING, - InstanceStatus.TERMINATING, - ): - res = await session.execute( - select(InstanceModel) - .where(InstanceModel.id == instance.id) - .options(joinedload(InstanceModel.project).joinedload(ProjectModel.backends)) - .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) - .options( - joinedload(InstanceModel.fleet).joinedload( - FleetModel.instances.and_(InstanceModel.deleted == False) - ), - ) - .execution_options(populate_existing=True) - ) - instance = res.unique().scalar_one() + if instance.status == InstanceStatus.PENDING: + await _process_pending_instance(session, instance) + elif instance.status == InstanceStatus.PROVISIONING: + await _process_provisioning_instance(session, instance) elif instance.status == InstanceStatus.IDLE: - res = await session.execute( - select(InstanceModel) - .where(InstanceModel.id == instance.id) - .options(joinedload(InstanceModel.project)) - .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) - .options( - joinedload(InstanceModel.fleet).joinedload( - FleetModel.instances.and_(InstanceModel.deleted == False) - ), + await _process_idle_instance(session, instance) + elif instance.status == InstanceStatus.BUSY: + await _process_busy_instance(session, instance) + elif instance.status == InstanceStatus.TERMINATING: + await _process_terminating_instance(session, instance) + + instance.last_processed_at = get_current_datetime() + await session.commit() + + +async def _process_pending_instance(session: AsyncSession, instance: InstanceModel): + instance = await _refetch_instance_for_pending_or_terminating(session, instance.id) + if is_ssh_instance(instance): + await _add_remote(session, instance) + else: + await _create_instance(session=session, instance=instance) + + +async def _process_provisioning_instance(session: AsyncSession, instance: InstanceModel): + await _check_instance(session, instance) + + +async def 
_process_idle_instance(session: AsyncSession, instance: InstanceModel): + instance = await _refetch_instance_for_idle(session, instance.id) + idle_duration_expired = _check_and_mark_terminating_if_idle_duration_expired(session, instance) + if not idle_duration_expired: + await _check_instance(session, instance) + + +async def _process_busy_instance(session: AsyncSession, instance: InstanceModel): + await _check_instance(session, instance) + + +async def _process_terminating_instance(session: AsyncSession, instance: InstanceModel): + instance = await _refetch_instance_for_pending_or_terminating(session, instance.id) + await _terminate(session, instance) + + +async def _refetch_instance_for_pending_or_terminating( + session: AsyncSession, instance_id +) -> InstanceModel: + res = await session.execute( + select(InstanceModel) + .where(InstanceModel.id == instance_id) + .options(joinedload(InstanceModel.project).joinedload(ProjectModel.backends)) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + .options( + joinedload(InstanceModel.fleet).joinedload( + FleetModel.instances.and_(InstanceModel.deleted == False) ) - .execution_options(populate_existing=True) ) - instance = res.unique().scalar_one() + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one() - if instance.status == InstanceStatus.PENDING: - if is_ssh_instance(instance): - await _add_remote(session, instance) - else: - await _create_instance( - session=session, - instance=instance, + +async def _refetch_instance_for_idle(session: AsyncSession, instance_id) -> InstanceModel: + res = await session.execute( + select(InstanceModel) + .where(InstanceModel.id == instance_id) + .options(joinedload(InstanceModel.project)) + .options(joinedload(InstanceModel.jobs).load_only(JobModel.id, JobModel.status)) + .options( + joinedload(InstanceModel.fleet).joinedload( + FleetModel.instances.and_(InstanceModel.deleted == False) ) - elif instance.status in ( - 
InstanceStatus.PROVISIONING, - InstanceStatus.IDLE, - InstanceStatus.BUSY, - ): - idle_duration_expired = _check_and_mark_terminating_if_idle_duration_expired( - session, instance ) - if not idle_duration_expired: - await _check_instance(session, instance) - elif instance.status == InstanceStatus.TERMINATING: - await _terminate(session, instance) - - instance.last_processed_at = get_current_datetime() - await session.commit() + .execution_options(populate_existing=True) + ) + return res.unique().scalar_one() def _check_and_mark_terminating_if_idle_duration_expired( @@ -324,76 +330,61 @@ async def _add_remote(session: AsyncSession, instance: InstanceModel) -> None: switch_instance_status(session, instance, InstanceStatus.TERMINATED) return + remote_details = get_instance_remote_connection_info(instance) + assert remote_details is not None + try: - remote_details = get_instance_remote_connection_info(instance) - assert remote_details is not None - # Prepare connection key - try: - pkeys = _ssh_keys_to_pkeys(remote_details.ssh_keys) - if remote_details.ssh_proxy_keys is not None: - ssh_proxy_pkeys = _ssh_keys_to_pkeys(remote_details.ssh_proxy_keys) - else: - ssh_proxy_pkeys = None - except (ValueError, PasswordRequiredException): - instance.termination_reason = InstanceTerminationReason.ERROR - instance.termination_reason_message = "Unsupported private SSH key type" - switch_instance_status(session, instance, InstanceStatus.TERMINATED) - return - - authorized_keys = [pk.public.strip() for pk in remote_details.ssh_keys] - authorized_keys.append(instance.project.ssh_public_key.strip()) + pkeys = _ssh_keys_to_pkeys(remote_details.ssh_keys) + if remote_details.ssh_proxy_keys is not None: + ssh_proxy_pkeys = _ssh_keys_to_pkeys(remote_details.ssh_proxy_keys) + else: + ssh_proxy_pkeys = None + except (ValueError, PasswordRequiredException): + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Unsupported private SSH key 
type" + switch_instance_status(session, instance, InstanceStatus.TERMINATED) + return - try: - future = run_async( - _deploy_instance, remote_details, pkeys, ssh_proxy_pkeys, authorized_keys - ) - deploy_timeout = 20 * 60 # 20 minutes - result = await asyncio.wait_for(future, timeout=deploy_timeout) - health, host_info, arch = result - except (asyncio.TimeoutError, TimeoutError) as e: - raise ProvisioningError(f"Deploy timeout: {e}") from e - except Exception as e: - raise ProvisioningError(f"Deploy instance raised an error: {e}") from e - except ProvisioningError as e: + authorized_keys = [pk.public.strip() for pk in remote_details.ssh_keys] + authorized_keys.append(instance.project.ssh_public_key.strip()) + + try: + future = run_async( + _deploy_instance, remote_details, pkeys, ssh_proxy_pkeys, authorized_keys + ) + deploy_timeout = 20 * 60 # 20 minutes + health, host_info, arch = await asyncio.wait_for(future, timeout=deploy_timeout) + except (asyncio.TimeoutError, TimeoutError) as e: logger.warning( - "Provisioning instance %s could not be completed because of the error: %s", - instance.name, - e, + "%s: deploy timeout when adding SSH instance: %s", + fmt(instance), + repr(e), + ) + # Stays in PENDING, may retry later + return + except SSHProvisioningError as e: + logger.warning( + "%s: provisioning error when adding SSH instance: %s", + fmt(instance), + repr(e), ) # Stays in PENDING, may retry later return + except Exception: + logger.exception("%s: unexpected error when adding SSH instance", fmt(instance)) + instance.termination_reason = InstanceTerminationReason.ERROR + instance.termination_reason_message = "Unexpected error when adding SSH instance" + switch_instance_status(session, instance, InstanceStatus.TERMINATED) + return instance_type = host_info_to_instance_type(host_info, arch) - instance_network = None - internal_ip = None try: - default_jpd = JobProvisioningData.__response__.parse_raw(instance.job_provisioning_data) - instance_network = 
default_jpd.instance_network - internal_ip = default_jpd.internal_ip - except ValidationError: - pass - - host_network_addresses = host_info.get("addresses", []) - if internal_ip is None: - internal_ip = get_ip_from_network( - network=instance_network, - addresses=host_network_addresses, - ) - if instance_network is not None and internal_ip is None: + instance_network, internal_ip = _resolve_ssh_instance_network(instance, host_info) + except _SSHInstanceNetworkResolutionError as e: instance.termination_reason = InstanceTerminationReason.ERROR - instance.termination_reason_message = ( - "Failed to locate internal IP address on the given network" - ) + instance.termination_reason_message = str(e) switch_instance_status(session, instance, InstanceStatus.TERMINATED) return - if internal_ip is not None: - if not is_ip_among_addresses(ip_address=internal_ip, addresses=host_network_addresses): - instance.termination_reason = InstanceTerminationReason.ERROR - instance.termination_reason_message = ( - "Specified internal IP not found among instance interfaces" - ) - switch_instance_status(session, instance, InstanceStatus.TERMINATED) - return divisible, blocks = is_divisible_into_blocks( cpu_count=instance_type.resources.cpus, @@ -444,6 +435,41 @@ async def _add_remote(session: AsyncSession, instance: InstanceModel) -> None: instance.started_at = get_current_datetime() +class _SSHInstanceNetworkResolutionError(Exception): + pass + + +def _resolve_ssh_instance_network( + instance: InstanceModel, host_info: dict[str, Any] +) -> tuple[Optional[str], Optional[str]]: + instance_network = None + internal_ip = None + try: + default_jpd = JobProvisioningData.__response__.parse_raw(instance.job_provisioning_data) + instance_network = default_jpd.instance_network + internal_ip = default_jpd.internal_ip + except ValidationError: + pass + + host_network_addresses = host_info.get("addresses", []) + if internal_ip is None: + internal_ip = get_ip_from_network( + network=instance_network, 
+ addresses=host_network_addresses, + ) + if instance_network is not None and internal_ip is None: + raise _SSHInstanceNetworkResolutionError( + "Failed to locate internal IP address on the given network" + ) + if internal_ip is not None and not is_ip_among_addresses( + ip_address=internal_ip, addresses=host_network_addresses + ): + raise _SSHInstanceNetworkResolutionError( + "Specified internal IP not found among instance interfaces" + ) + return instance_network, internal_ip + + def _deploy_instance( remote_details: RemoteConnectionInfo, pkeys: list[PKey], @@ -473,7 +499,7 @@ def _deploy_instance( try: fleet_configuration_envs = remote_details.env.as_dict() except ValueError as e: - raise ProvisioningError(f"Invalid Env: {e}") from e + raise SSHProvisioningError(f"Invalid Env: {e}") from e shim_envs.update(fleet_configuration_envs) dstack_working_dir = get_dstack_working_dir() dstack_shim_binary_path = get_dstack_shim_binary_path() @@ -501,7 +527,7 @@ def _deploy_instance( try: healthcheck = HealthcheckResponse.__response__.parse_raw(healthcheck_out) except ValueError as e: - raise ProvisioningError(f"Cannot parse HealthcheckResponse: {e}") from e + raise SSHProvisioningError(f"Cannot parse HealthcheckResponse: {e}") from e instance_check = runner_client.healthcheck_response_to_instance_check(healthcheck) return instance_check, host_info, arch @@ -646,6 +672,7 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No if instance.fleet and instance.id == master_instance.id and is_cloud_cluster(instance.fleet): # Do not attempt to deploy other instances, as they won't determine the correct cluster # backend, region, and placement group without a successfully deployed master instance + # FIXME: Race condition with siblings processed concurrently. 
for sibling_instance in instance.fleet.instances: if sibling_instance.id == instance.id: continue @@ -707,50 +734,22 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non switch_instance_status(session, instance, InstanceStatus.BUSY) return - ssh_private_keys = get_instance_ssh_private_keys(instance) - - health_check_cutoff = get_current_datetime() - timedelta( - seconds=server_settings.SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS - ) - res = await session.execute( - select(func.count(1)).where( - InstanceHealthCheckModel.instance_id == instance.id, - InstanceHealthCheckModel.collected_at > health_check_cutoff, - ) + check_instance_health = await _should_check_instance_health(session, instance) + instance_check = await _run_instance_check( + instance=instance, + job_provisioning_data=job_provisioning_data, + check_instance_health=check_instance_health, ) - check_instance_health = res.scalar_one() == 0 - - # May return False if fails to establish ssh connection - instance_check = await run_async( - _check_instance_inner, - ssh_private_keys, - job_provisioning_data, - None, + health_status = _get_health_status_for_instance_check( instance=instance, + instance_check=instance_check, check_instance_health=check_instance_health, ) - if instance_check is False: - instance_check = InstanceCheck(reachable=False, message="SSH or tunnel error") - - if instance_check.reachable and check_instance_health: - health_status = instance_check.get_health_status() - else: - # Keep previous health status - health_status = instance.health - - loglevel = logging.DEBUG - if not instance_check.reachable and instance.status.is_available(): - loglevel = logging.WARNING - elif check_instance_health and not health_status.is_healthy(): - loglevel = logging.WARNING - logger.log( - loglevel, - "Instance %s check: reachable=%s health_status=%s message=%r", - instance.name, - instance_check.reachable, - health_status.name, - instance_check.message, - 
extra={"instance_name": instance.name, "health_status": health_status}, + _log_instance_check_result( + instance=instance, + instance_check=instance_check, + health_status=health_status, + check_instance_health=check_instance_health, ) if instance_check.has_health_checks(): @@ -797,6 +796,73 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> Non switch_instance_status(session, instance, InstanceStatus.TERMINATING) +async def _should_check_instance_health(session: AsyncSession, instance: InstanceModel) -> bool: + health_check_cutoff = get_current_datetime() - timedelta( + seconds=server_settings.SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS + ) + result = await session.execute( + select(func.count(1)).where( + InstanceHealthCheckModel.instance_id == instance.id, + InstanceHealthCheckModel.collected_at > health_check_cutoff, + ) + ) + return result.scalar_one() == 0 + + +async def _run_instance_check( + instance: InstanceModel, + job_provisioning_data: JobProvisioningData, + check_instance_health: bool, +) -> InstanceCheck: + ssh_private_keys = get_instance_ssh_private_keys(instance) + + # May return False if fails to establish ssh connection + instance_check = await run_async( + _check_instance_inner, + ssh_private_keys, + job_provisioning_data, + None, + instance=instance, + check_instance_health=check_instance_health, + ) + if instance_check is False: + return InstanceCheck(reachable=False, message="SSH or tunnel error") + return instance_check + + +def _get_health_status_for_instance_check( + instance: InstanceModel, + instance_check: InstanceCheck, + check_instance_health: bool, +) -> HealthStatus: + if instance_check.reachable and check_instance_health: + return instance_check.get_health_status() + # Keep previous health status + return instance.health + + +def _log_instance_check_result( + instance: InstanceModel, + instance_check: InstanceCheck, + health_status: HealthStatus, + check_instance_health: bool, +) -> None: + 
loglevel = logging.DEBUG + if not instance_check.reachable and instance.status.is_available(): + loglevel = logging.WARNING + elif check_instance_health and not health_status.is_healthy(): + loglevel = logging.WARNING + logger.log( + loglevel, + "Instance %s check: reachable=%s health_status=%s message=%r", + instance.name, + instance_check.reachable, + health_status.name, + instance_check.message, + extra={"instance_name": instance.name, "health_status": health_status}, + ) + + async def _wait_for_instance_provisioning_data( session: AsyncSession, project: ProjectModel, @@ -1134,7 +1200,8 @@ def _get_termination_deadline(instance: InstanceModel) -> datetime.datetime: def _need_to_wait_fleet_provisioning( - instance: InstanceModel, master_instance: InstanceModel + instance: InstanceModel, + master_instance: InstanceModel, ) -> bool: # Cluster cloud instances should wait for the first fleet instance to be provisioned # so that they are provisioned in the same backend/region diff --git a/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py b/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py index 151f07deeb..729ded205c 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/submitted_jobs.py @@ -574,6 +574,10 @@ async def _fetch_fleet_with_master_instance_provisioning_data( fleet_model: Optional[FleetModel], job: Job, ) -> Optional[JobProvisioningData]: + # TODO: When submitted-jobs provisioning moves to pipelines, stop inferring the + # cluster master from loaded fleet instances here. Resolve the current master via + # FleetModel.current_master_instance_id so jobs follow the same master election + # as FleetPipeline/InstancePipeline. 
master_instance_provisioning_data = None if is_master_job(job) and fleet_model is not None: fleet = fleet_model_to_fleet(fleet_model) diff --git a/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py b/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py index 3749076c1a..27163b53d9 100644 --- a/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py +++ b/src/dstack/_internal/server/background/scheduled_tasks/terminating_jobs.py @@ -66,6 +66,7 @@ async def _process_next_terminating_job(): .where( InstanceModel.id == job_model.used_instance_id, InstanceModel.id.not_in(instance_lockset), + InstanceModel.lock_expires_at.is_(None), ) .with_for_update(skip_locked=True, key_share=True) ) diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_05_0547_8e8647f20aa4_add_instancemodel_pipeline_columns.py b/src/dstack/_internal/server/migrations/versions/2026/03_05_0547_8e8647f20aa4_add_instancemodel_pipeline_columns.py new file mode 100644 index 0000000000..f1c2b1217a --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_05_0547_8e8647f20aa4_add_instancemodel_pipeline_columns.py @@ -0,0 +1,47 @@ +"""Add InstanceModel pipeline columns + +Revision ID: 8e8647f20aa4 +Revises: 5e8c7a9202bc +Create Date: 2026-03-05 05:47:39.307013+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +import dstack._internal.server.models + +# revision identifiers, used by Alembic. +revision = "8e8647f20aa4" +down_revision = "5e8c7a9202bc" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "lock_expires_at", dstack._internal.server.models.NaiveDateTime(), nullable=True + ) + ) + batch_op.add_column( + sa.Column( + "lock_token", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True + ) + ) + batch_op.add_column(sa.Column("lock_owner", sa.String(length=100), nullable=True)) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("instances", schema=None) as batch_op: + batch_op.drop_column("lock_owner") + batch_op.drop_column("lock_token") + batch_op.drop_column("lock_expires_at") + + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_05_0751_297c68450cc8_add_ix_instances_pipeline_fetch_q_index.py b/src/dstack/_internal/server/migrations/versions/2026/03_05_0751_297c68450cc8_add_ix_instances_pipeline_fetch_q_index.py new file mode 100644 index 0000000000..e629de0950 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_05_0751_297c68450cc8_add_ix_instances_pipeline_fetch_q_index.py @@ -0,0 +1,49 @@ +"""Add ix_instances_pipeline_fetch_q index + +Revision ID: 297c68450cc8 +Revises: 8e8647f20aa4 +Create Date: 2026-03-05 07:51:02.855596+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "297c68450cc8" +down_revision = "8e8647f20aa4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_instances_pipeline_fetch_q", + table_name="instances", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_instances_pipeline_fetch_q", + "instances", + [sa.literal_column("last_processed_at ASC")], + unique=False, + sqlite_where=sa.text("deleted = 0"), + postgresql_where=sa.text("deleted IS FALSE"), + postgresql_concurrently=True, + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.get_context().autocommit_block(): + op.drop_index( + "ix_instances_pipeline_fetch_q", + table_name="instances", + if_exists=True, + postgresql_concurrently=True, + ) + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_05_1015_9cb8e4e4d986_add_fleet_current_master_instance.py b/src/dstack/_internal/server/migrations/versions/2026/03_05_1015_9cb8e4e4d986_add_fleet_current_master_instance.py new file mode 100644 index 0000000000..2049236267 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_05_1015_9cb8e4e4d986_add_fleet_current_master_instance.py @@ -0,0 +1,37 @@ +"""Add FleetModel current master instance + +Revision ID: 9cb8e4e4d986 +Revises: 297c68450cc8 +Create Date: 2026-03-05 10:15:00.000000+00:00 + +""" + +import sqlalchemy as sa +import sqlalchemy_utils +from alembic import op + +# revision identifiers, used by Alembic. +revision = "9cb8e4e4d986" +down_revision = "297c68450cc8" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "current_master_instance_id", + sqlalchemy_utils.types.uuid.UUIDType(binary=False), + nullable=True, + ) + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("fleets", schema=None) as batch_op: + batch_op.drop_column("current_master_instance_id") + # ### end Alembic commands ### diff --git a/src/dstack/_internal/server/migrations/versions/2026/03_05_1045_c7b0a8e57294_add_ix_fleets_current_master_instance_id.py b/src/dstack/_internal/server/migrations/versions/2026/03_05_1045_c7b0a8e57294_add_ix_fleets_current_master_instance_id.py new file mode 100644 index 0000000000..e1cb938750 --- /dev/null +++ b/src/dstack/_internal/server/migrations/versions/2026/03_05_1045_c7b0a8e57294_add_ix_fleets_current_master_instance_id.py @@ -0,0 +1,42 @@ +"""Add ix_fleets_current_master_instance_id index + +Revision ID: c7b0a8e57294 +Revises: 9cb8e4e4d986 +Create Date: 2026-03-05 10:45:00.000000+00:00 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "c7b0a8e57294" +down_revision = "9cb8e4e4d986" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + with op.get_context().autocommit_block(): + op.drop_index( + "ix_fleets_current_master_instance_id", + table_name="fleets", + if_exists=True, + postgresql_concurrently=True, + ) + op.create_index( + "ix_fleets_current_master_instance_id", + "fleets", + ["current_master_instance_id"], + unique=False, + postgresql_concurrently=True, + ) + + +def downgrade() -> None: + with op.get_context().autocommit_block(): + op.drop_index( + "ix_fleets_current_master_instance_id", + table_name="fleets", + if_exists=True, + postgresql_concurrently=True, + ) diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index 15801a25df..d1a30b941b 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -602,7 +602,14 @@ class FleetModel(PipelineModelMixin, BaseModel): runs: Mapped[List["RunModel"]] = relationship(back_populates="fleet") jobs: Mapped[List["JobModel"]] = relationship(back_populates="fleet") - instances: Mapped[List["InstanceModel"]] = relationship(back_populates="fleet") + instances: Mapped[List["InstanceModel"]] = relationship( + back_populates="fleet", + foreign_keys="InstanceModel.fleet_id", + ) + + current_master_instance_id: Mapped[Optional[uuid.UUID]] = mapped_column( + UUIDType(binary=False), index=True + ) # `consolidation_attempt` counts how many times in a row fleet needed consolidation. # Allows increasing delays between attempts. 
@@ -619,7 +626,7 @@ class FleetModel(PipelineModelMixin, BaseModel): ) -class InstanceModel(BaseModel): +class InstanceModel(PipelineModelMixin, BaseModel): __tablename__ = "instances" id: Mapped[uuid.UUID] = mapped_column( @@ -647,7 +654,10 @@ class InstanceModel(BaseModel): pool: Mapped[Optional["PoolModel"]] = relationship(back_populates="instances") fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id"), index=True) - fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="instances") + fleet: Mapped[Optional["FleetModel"]] = relationship( + back_populates="instances", + foreign_keys=[fleet_id], + ) compute_group_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("compute_groups.id")) compute_group: Mapped[Optional["ComputeGroupModel"]] = relationship(back_populates="instances") @@ -727,6 +737,15 @@ class InstanceModel(BaseModel): cascade="save-update, merge, delete-orphan, delete", ) + __table_args__ = ( + Index( + "ix_instances_pipeline_fetch_q", + last_processed_at.asc(), + postgresql_where=deleted == false(), + sqlite_where=deleted == false(), + ), + ) + class InstanceHealthCheckModel(BaseModel): __tablename__ = "instance_health_checks" diff --git a/src/dstack/_internal/server/routers/fleets.py b/src/dstack/_internal/server/routers/fleets.py index cb18db8bbd..58c87d653b 100644 --- a/src/dstack/_internal/server/routers/fleets.py +++ b/src/dstack/_internal/server/routers/fleets.py @@ -26,6 +26,7 @@ ProjectMember, check_can_access_fleet, ) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol, get_pipeline_hinter from dstack._internal.server.utils.routers import ( CustomORJSONResponse, get_base_api_additional_responses, @@ -144,6 +145,7 @@ async def apply_plan( body: ApplyFleetPlanRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), ): """ 
Creates a new fleet or updates an existing fleet. @@ -158,6 +160,7 @@ async def apply_plan( project=project, plan=body.plan, force=body.force, + pipeline_hinter=pipeline_hinter, ) ) @@ -167,6 +170,7 @@ async def create_fleet( body: CreateFleetRequest, session: AsyncSession = Depends(get_session), user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()), + pipeline_hinter: PipelineHinterProtocol = Depends(get_pipeline_hinter), ): """ Creates a fleet given a fleet configuration. @@ -178,6 +182,7 @@ async def create_fleet( project=project, user=user, spec=body.spec, + pipeline_hinter=pipeline_hinter, ) ) diff --git a/src/dstack/_internal/server/services/fleets.py b/src/dstack/_internal/server/services/fleets.py index ca5a2e7b4f..183e81b208 100644 --- a/src/dstack/_internal/server/services/fleets.py +++ b/src/dstack/_internal/server/services/fleets.py @@ -1,3 +1,4 @@ +import asyncio import uuid from collections.abc import Callable from datetime import datetime @@ -51,7 +52,7 @@ from dstack._internal.core.models.users import GlobalRole from dstack._internal.core.services import validate_dstack_resource_name from dstack._internal.core.services.diff import ModelDiff, copy_model, diff_models -from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite +from dstack._internal.server.db import get_db, is_db_postgres, is_db_sqlite, sqlite_commit from dstack._internal.server.models import ( ExportedFleetModel, FleetModel, @@ -75,6 +76,7 @@ get_locker, string_to_lock_id, ) +from dstack._internal.server.services.pipelines import PipelineHinterProtocol from dstack._internal.server.services.plugins import apply_plugin_policies from dstack._internal.server.services.projects import ( get_member, @@ -84,7 +86,12 @@ ) from dstack._internal.server.services.resources import set_resources_defaults from dstack._internal.utils import random_names -from dstack._internal.utils.common import EntityID, EntityName, EntityNameOrID +from dstack._internal.utils.common 
import ( + EntityID, + EntityName, + EntityNameOrID, + get_current_datetime, +) from dstack._internal.utils.logging import get_logger from dstack._internal.utils.ssh import pkey_from_str @@ -465,19 +472,26 @@ async def get_create_instance_offers( fleet_model: Optional[FleetModel] = None, blocks: Union[int, Literal["auto"]] = 1, exclude_not_available: bool = False, + master_job_provisioning_data: Optional[JobProvisioningData] = None, + infer_master_job_provisioning_data_from_fleet_instances: bool = True, ) -> List[Tuple[Backend, InstanceOfferWithAvailability]]: multinode = False - master_job_provisioning_data = None if fleet_spec is not None: multinode = fleet_spec.configuration.placement == InstanceGroupPlacement.CLUSTER if fleet_model is not None: - fleet = fleet_model_to_fleet(fleet_model) - multinode = fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER - for instance in fleet_model.instances: - jpd = instances_services.get_instance_provisioning_data(instance) - if jpd is not None: - master_job_provisioning_data = jpd - break + fleet_spec_from_model = get_fleet_spec(fleet_model) + multinode = fleet_spec_from_model.configuration.placement == InstanceGroupPlacement.CLUSTER + # The caller may override the current cluster master explicitly instead + # of inferring placement restrictions from the loaded fleet instances. 
+ if ( + master_job_provisioning_data is None + and infer_master_job_provisioning_data_from_fleet_instances + ): + for instance in fleet_model.instances: + jpd = instances_services.get_instance_provisioning_data(instance) + if jpd is not None: + master_job_provisioning_data = jpd + break offers = await offers_services.get_offers_by_requirements( project=project, @@ -503,6 +517,7 @@ async def apply_plan( project: ProjectModel, plan: ApplyFleetPlanInput, force: bool, + pipeline_hinter: PipelineHinterProtocol, ) -> Fleet: spec = await apply_plugin_policies( user=user.name, @@ -523,6 +538,7 @@ async def apply_plan( project=project, user=user, spec=spec, + pipeline_hinter=pipeline_hinter, ) fleet_model = await get_project_fleet_model_by_name( @@ -536,6 +552,7 @@ async def apply_plan( project=project, user=user, spec=spec, + pipeline_hinter=pipeline_hinter, ) instances_ids = sorted(i.id for i in fleet_model.instances if not i.deleted) @@ -546,6 +563,8 @@ async def apply_plan( ): # Refetch after lock # TODO: Lock instances with FOR UPDATE? + # We do not respect InstanceModel.lock_* fields here because FleetPipeline does not update SSH instances. + # TODO: Respect InstanceModel.lock_* fields if FleetPipeline and apply update the same instances. 
res = await session.execute( select(FleetModel) .where( @@ -591,6 +610,7 @@ async def apply_plan( project=project, user=user, spec=spec, + pipeline_hinter=pipeline_hinter, ) @@ -599,6 +619,7 @@ async def create_fleet( project: ProjectModel, user: UserModel, spec: FleetSpec, + pipeline_hinter: PipelineHinterProtocol, ) -> Fleet: spec = await apply_plugin_policies( user=user.name, @@ -612,7 +633,9 @@ async def create_fleet( if spec.configuration.ssh_config is not None: _check_can_manage_ssh_fleets(user=user, project=project) - return await _create_fleet(session=session, project=project, user=user, spec=spec) + return await _create_fleet( + session=session, project=project, user=user, spec=spec, pipeline_hinter=pipeline_hinter + ) def create_fleet_instance_model( @@ -621,6 +644,7 @@ def create_fleet_instance_model( username: str, spec: FleetSpec, instance_num: int, + instance_id: Optional[uuid.UUID] = None, ) -> InstanceModel: profile = spec.merged_profile requirements = get_fleet_requirements(spec) @@ -632,6 +656,7 @@ def create_fleet_instance_model( requirements=requirements, instance_name=f"{spec.configuration.name}-{instance_num}", instance_num=instance_num, + instance_id=instance_id, reservation=spec.merged_profile.reservation, blocks=spec.configuration.blocks, tags=spec.configuration.tags, @@ -716,7 +741,7 @@ async def delete_fleets( .order_by(FleetModel.id) ) fleets_ids = list(res.scalars().unique().all()) - res = await session.execute( + stmt = ( select(InstanceModel.id) .where( InstanceModel.fleet_id.in_(fleets_ids), @@ -724,60 +749,73 @@ async def delete_fleets( ) .order_by(InstanceModel.id) ) + if instance_nums is not None: + stmt = stmt.where(InstanceModel.instance_num.in_(instance_nums)) + res = await session.execute(stmt) instances_ids = list(res.scalars().unique().all()) - if is_db_sqlite(): - # Start new transaction to see committed changes after lock - await session.commit() + await sqlite_commit(session) async with ( 
get_locker(get_db().dialect_name).lock_ctx(FleetModel.__tablename__, fleets_ids), get_locker(get_db().dialect_name).lock_ctx(InstanceModel.__tablename__, instances_ids), ): - # Refetch after lock. - # TODO: Do not lock fleet when deleting only instances. - res = await session.execute( - select(FleetModel) - .where( - FleetModel.project_id == project.id, - FleetModel.id.in_(fleets_ids), - FleetModel.deleted == False, - FleetModel.lock_expires_at.is_(None), - ) - .options( - selectinload(FleetModel.instances.and_(InstanceModel.id.in_(instances_ids))) - .selectinload(InstanceModel.jobs) - .load_only(JobModel.id) - ) - .options( - selectinload( - FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) - ).load_only(RunModel.status) + # Retry locking fleets to increase lock acquisition chances. + # This hack is needed until requests are queued. + fleet_models = [] + for i in range(10): + res = await session.execute( + select(FleetModel) + .where( + FleetModel.project_id == project.id, + FleetModel.id.in_(fleets_ids), + FleetModel.deleted == False, + FleetModel.lock_expires_at.is_(None), + ) + .options( + selectinload(FleetModel.instances.and_(InstanceModel.id.in_(instances_ids))) + .selectinload(InstanceModel.jobs) + .load_only(JobModel.id) + ) + .options( + selectinload( + FleetModel.runs.and_(RunModel.status.not_in(RunStatus.finished_statuses())) + ).load_only(RunModel.status) + ) + .order_by(FleetModel.id) # take locks in order + .with_for_update(key_share=True, of=FleetModel) + .execution_options(populate_existing=True) ) - .execution_options(populate_existing=True) - .order_by(FleetModel.id) # take locks in order - .with_for_update(key_share=True, of=FleetModel) - ) - fleet_models = res.scalars().unique().all() + fleet_models = res.scalars().unique().all() + if len(fleet_models) == len(fleets_ids): + break + await asyncio.sleep(0.5) if len(fleet_models) != len(fleets_ids): - # TODO: Make the endpoint fully async so we don't need to lock and 
error: - # put the request in queue and process in the background. + # TODO: Make the endpoint fully async so we don't need to lock and error. msg = ( "Failed to delete fleets: fleets are being processed currently. Try again later." if instance_nums is None else "Failed to delete fleet instances: fleets are being processed currently. Try again later." ) raise ServerClientError(msg) - res = await session.execute( - select(InstanceModel.id) - .where( - InstanceModel.id.in_(instances_ids), - InstanceModel.deleted == False, + # Retry locking instances to increase lock acquisition chances. + # This hack is needed until requests are queued. + instances_left_to_lock = set(instances_ids) + for i in range(10): + res = await session.execute( + select(InstanceModel.id) + .where( + InstanceModel.id.in_(instances_left_to_lock), + InstanceModel.deleted == False, + InstanceModel.lock_expires_at.is_(None), + ) + .order_by(InstanceModel.id) # take locks in order + .with_for_update(key_share=True, of=InstanceModel) + .execution_options(populate_existing=True) ) - .order_by(InstanceModel.id) # take locks in order - .with_for_update(key_share=True, of=InstanceModel) - .execution_options(populate_existing=True) - ) - instance_models_ids = list(res.scalars().unique().all()) - if len(instance_models_ids) != len(instances_ids): + instances_left_to_lock.difference_update(res.scalars().unique().all()) + if len(instances_left_to_lock) == 0: + break + await asyncio.sleep(0.5) + if len(instances_left_to_lock) > 0: msg = ( "Failed to delete fleets: fleet instances are being processed currently. Try again later." 
if instance_nums is None @@ -785,8 +823,8 @@ async def delete_fleets( ) raise ServerClientError(msg) for fleet_model in fleet_models: - fleet = fleet_model_to_fleet(fleet_model) - if fleet.spec.configuration.ssh_config is not None: + fleet_spec = get_fleet_spec(fleet_model) + if fleet_spec.configuration.ssh_config is not None: _check_can_manage_ssh_fleets(user=user, project=project) if instance_nums is None: logger.info("Deleting fleets: %s", [f.name for f in fleet_models]) @@ -867,10 +905,10 @@ def is_fleet_empty(fleet_model: FleetModel) -> bool: def is_cloud_cluster(fleet_model: FleetModel) -> bool: - fleet = fleet_model_to_fleet(fleet_model) + fleet_spec = get_fleet_spec(fleet_model) return ( - fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER - and fleet.spec.configuration.ssh_config is None + fleet_spec.configuration.placement == InstanceGroupPlacement.CLUSTER + and fleet_spec.configuration.ssh_config is None ) @@ -905,6 +943,9 @@ def get_fleet_master_instance_provisioning_data( ) -> Optional[JobProvisioningData]: master_instance_provisioning_data = None if fleet_spec.configuration.placement == InstanceGroupPlacement.CLUSTER: + # TODO: This legacy helper infers the cluster master from fleet instances. + # Pipeline-based provisioning should use FleetModel.current_master_instance_id + # instead of relying on instance ordering in the loaded relationship. # Offers for master jobs must be in the same cluster as existing instances. 
fleet_instance_models = [im for im in fleet_model.instances if not im.deleted] if len(fleet_instance_models) > 0: @@ -940,6 +981,7 @@ async def _create_fleet( project: ProjectModel, user: UserModel, spec: FleetSpec, + pipeline_hinter: PipelineHinterProtocol, ) -> Fleet: lock_namespace = f"fleet_names_{project.name}" if is_db_sqlite(): @@ -962,6 +1004,7 @@ async def _create_fleet( else: spec.configuration.name = await generate_fleet_name(session=session, project=project) + now = get_current_datetime() fleet_model = FleetModel( id=uuid.uuid4(), name=spec.configuration.name, @@ -969,6 +1012,8 @@ async def _create_fleet( status=FleetStatus.ACTIVE, spec=spec.json(), instances=[], + created_at=now, + last_processed_at=now, ) session.add(fleet_model) events.emit( @@ -1021,6 +1066,9 @@ async def _create_fleet( ) fleet_model.instances.append(instance_model) await session.commit() + if spec.configuration.ssh_config is None: + pipeline_hinter.hint_fetch(FleetModel.__name__) + pipeline_hinter.hint_fetch(InstanceModel.__name__) return fleet_model_to_fleet(fleet_model) diff --git a/src/dstack/_internal/server/services/gateways/__init__.py b/src/dstack/_internal/server/services/gateways/__init__.py index ddc3d64c44..b4dfef083f 100644 --- a/src/dstack/_internal/server/services/gateways/__init__.py +++ b/src/dstack/_internal/server/services/gateways/__init__.py @@ -341,23 +341,28 @@ async def _delete_gateways_pipeline( async with get_locker(get_db().dialect_name).lock_ctx( GatewayModel.__tablename__, gateways_ids ): - # Refetch after lock - res = await session.execute( - select(GatewayModel) - .where( - GatewayModel.id.in_(gateways_ids), - GatewayModel.project_id == project.id, - GatewayModel.lock_expires_at.is_(None), + # Retry locking gateways to increase lock acquisition chances. + # This hack is needed until requests are queued. 
+ gateway_models = [] + for i in range(10): + res = await session.execute( + select(GatewayModel) + .where( + GatewayModel.id.in_(gateways_ids), + GatewayModel.project_id == project.id, + GatewayModel.lock_expires_at.is_(None), + ) + .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) + .order_by(GatewayModel.id) # take locks in order + .with_for_update(key_share=True, of=GatewayModel) + .execution_options(populate_existing=True) ) - .options(joinedload(GatewayModel.backend).load_only(BackendModel.type)) - .order_by(GatewayModel.id) # take locks in order - .with_for_update(key_share=True, nowait=True, of=GatewayModel) - .execution_options(populate_existing=True) - ) - gateway_models = res.scalars().all() + gateway_models = res.scalars().all() + if len(gateway_models) == len(gateways_ids): + break + await asyncio.sleep(0.5) if len(gateway_models) != len(gateways_ids): - # TODO: Make the endpoint fully async so we don't need to lock and error: - # put the request in queue and process in the background. + # TODO: Make the endpoint fully async so we don't need to lock and error. raise ServerClientError( "Failed to delete gateways: gateways are being processed currently. Try again later." 
) diff --git a/src/dstack/_internal/server/services/instances.py b/src/dstack/_internal/server/services/instances.py index 079faf90c7..e07bce938b 100644 --- a/src/dstack/_internal/server/services/instances.py +++ b/src/dstack/_internal/server/services/instances.py @@ -90,6 +90,8 @@ def switch_instance_status( instance_model=instance_model, old_status=old_status, new_status=new_status, + termination_reason=instance_model.termination_reason, + termination_reason_message=instance_model.termination_reason_message, actor=actor, ) @@ -99,20 +101,26 @@ def emit_instance_status_change_event( instance_model: InstanceModel, old_status: InstanceStatus, new_status: InstanceStatus, + termination_reason: Optional[InstanceTerminationReason], + termination_reason_message: Optional[str], actor: events.AnyActor = events.SystemActor(), ) -> None: if old_status == new_status: return msg = get_instance_status_change_message( - instance_model=instance_model, old_status=old_status, new_status=new_status, + termination_reason=termination_reason, + termination_reason_message=termination_reason_message, ) events.emit(session, msg, actor=actor, targets=[events.Target.from_model(instance_model)]) def get_instance_status_change_message( - instance_model: InstanceModel, old_status: InstanceStatus, new_status: InstanceStatus + old_status: InstanceStatus, + new_status: InstanceStatus, + termination_reason: Optional[InstanceTerminationReason], + termination_reason_message: Optional[str], ) -> str: msg = f"Instance status changed {old_status.upper()} -> {new_status.upper()}" if ( @@ -120,20 +128,20 @@ def get_instance_status_change_message( or new_status == InstanceStatus.TERMINATED and old_status != InstanceStatus.TERMINATING ): - if instance_model.termination_reason is None: + if termination_reason is None: raise ValueError( f"termination_reason must be set when switching to {new_status.upper()} status" ) if ( - instance_model.termination_reason == InstanceTerminationReason.ERROR - and not 
instance_model.termination_reason_message + termination_reason == InstanceTerminationReason.ERROR + and not termination_reason_message ): raise ValueError( "termination_reason_message must be set when termination_reason is ERROR" ) - msg += f". Termination reason: {instance_model.termination_reason.upper()}" - if instance_model.termination_reason_message: - msg += f" ({instance_model.termination_reason_message})" + msg += f". Termination reason: {termination_reason.upper()}" + if termination_reason_message: + msg += f" ({termination_reason_message})" return msg @@ -651,11 +659,13 @@ def create_instance_model( reservation: Optional[str], blocks: Union[Literal["auto"], int], tags: Optional[Dict[str, str]], + instance_id: Optional[uuid.UUID] = None, ) -> InstanceModel: termination_policy, termination_idle_time = get_termination( profile, DEFAULT_FLEET_TERMINATION_IDLE_TIME ) - instance_id = uuid.uuid4() + if instance_id is None: + instance_id = uuid.uuid4() project_ssh_key = SSHKey( public=project.ssh_public_key.strip(), private=project.ssh_private_key.strip(), @@ -669,12 +679,14 @@ def create_instance_model( reservation=reservation, tags=tags, ) + now = common_utils.get_current_datetime() instance = InstanceModel( id=instance_id, name=instance_name, instance_num=instance_num, project=project, - created_at=common_utils.get_current_datetime(), + created_at=now, + last_processed_at=now, status=InstanceStatus.PENDING, unreachable=False, profile=profile.json(), diff --git a/src/dstack/_internal/server/services/runs/plan.py b/src/dstack/_internal/server/services/runs/plan.py index 4738622a07..9694cccd71 100644 --- a/src/dstack/_internal/server/services/runs/plan.py +++ b/src/dstack/_internal/server/services/runs/plan.py @@ -259,9 +259,12 @@ async def select_run_candidate_fleet_models_with_filters( .execution_options(populate_existing=True) ) if lock_instances: - stmt = stmt.order_by(InstanceModel.id).with_for_update( # take locks in order - key_share=True, of=InstanceModel 
- ) + # Skip locked instances since waiting for all the instances to unlock may take indefinite time. + # TODO: Switch to optimistic locking – implement select-lock-reselect loop. + stmt = stmt.where(InstanceModel.lock_expires_at.is_(None)) + stmt = stmt.order_by( + InstanceModel.id # take locks in order + ).with_for_update(skip_locked=True, key_share=True, of=InstanceModel) res = await session.execute(stmt) fleet_models_with_instances = list(res.unique().scalars().all()) fleet_models_with_instances_ids = [f.id for f in fleet_models_with_instances] diff --git a/src/dstack/_internal/server/services/ssh_fleets/__init__.py b/src/dstack/_internal/server/services/ssh_fleets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/dstack/_internal/server/utils/provisioning.py b/src/dstack/_internal/server/services/ssh_fleets/provisioning.py similarity index 86% rename from src/dstack/_internal/server/utils/provisioning.py rename to src/dstack/_internal/server/services/ssh_fleets/provisioning.py index fcbe3bf086..3a7c21e6dd 100644 --- a/src/dstack/_internal/server/utils/provisioning.py +++ b/src/dstack/_internal/server/services/ssh_fleets/provisioning.py @@ -14,9 +14,7 @@ normalize_arch, ) from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT - -# FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute -from dstack._internal.core.errors import ProvisioningError +from dstack._internal.core.errors import SSHProvisioningError from dstack._internal.core.models.instances import ( Disk, Gpu, @@ -46,15 +44,15 @@ def detect_cpu_arch(client: paramiko.SSHClient) -> GoArchType: try: _, stdout, stderr = client.exec_command(cmd, timeout=20) except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"detect_cpu_arch: {e}") from e + raise SSHProvisioningError(f"detect_cpu_arch: {e}") from e out = stdout.read().strip().decode() err = stderr.read().strip().decode() if err: - raise 
ProvisioningError(f"detect_cpu_arch: {cmd} failed, stdout: {out}, stderr: {err}") + raise SSHProvisioningError(f"detect_cpu_arch: {cmd} failed, stdout: {out}, stderr: {err}") try: return normalize_arch(out) except ValueError as e: - raise ProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e + raise SSHProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None: @@ -66,7 +64,7 @@ def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None: sftp.putfo(io.BytesIO(body.encode()), path) sftp.close() except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"sft_upload failed: {e}") from e + raise SSHProvisioningError(f"sft_upload failed: {e}") from e def upload_envs(client: paramiko.SSHClient, working_dir: str, envs: Dict[str, str]) -> None: @@ -80,11 +78,11 @@ def upload_envs(client: paramiko.SSHClient, working_dir: str, envs: Dict[str, st out = stdout.read().strip().decode() err = stderr.read().strip().decode() if out or err: - raise ProvisioningError( + raise SSHProvisioningError( f"The command 'upload_envs' didn't work. stdout: {out}, stderr: {err}" ) except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"upload_envs failed: {e}") from e + raise SSHProvisioningError(f"upload_envs failed: {e}") from e def run_pre_start_commands( @@ -98,11 +96,11 @@ def run_pre_start_commands( out = stdout.read().strip().decode() err = stderr.read().strip().decode() if out or err: - raise ProvisioningError( + raise SSHProvisioningError( f"The command 'authorized_keys' didn't work. 
stdout: {out}, stderr: {err}" ) except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"upload authorized_keys failed: {e}") from e + raise SSHProvisioningError(f"upload authorized_keys failed: {e}") from e script = " && ".join(shim_pre_start_commands) try: @@ -110,11 +108,11 @@ def run_pre_start_commands( out = stdout.read().strip().decode() err = stderr.read().strip().decode() if out or err: - raise ProvisioningError( + raise SSHProvisioningError( f"The command 'run_pre_start_commands' didn't work. stdout: {out}, stderr: {err}" ) except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"run_pre-start_commands failed: {e}") from e + raise SSHProvisioningError(f"run_pre-start_commands failed: {e}") from e def run_shim_as_systemd_service( @@ -158,11 +156,11 @@ def run_shim_as_systemd_service( out = stdout.read().strip().decode() err = stderr.read().strip().decode() if out or err: - raise ProvisioningError( + raise SSHProvisioningError( f"The command 'run_shim_as_systemd_service' didn't work. 
stdout: {out}, stderr: {err}" ) except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"run_shim_as_systemd failed: {e}") from e + raise SSHProvisioningError(f"run_shim_as_systemd failed: {e}") from e def check_dstack_shim_service(client: paramiko.SSHClient): @@ -170,12 +168,12 @@ def check_dstack_shim_service(client: paramiko.SSHClient): _, stdout, _ = client.exec_command("sudo systemctl status dstack-shim.service", timeout=10) status = stdout.read() except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"Checking dstack-shim.service status failed: {e}") from e + raise SSHProvisioningError(f"Checking dstack-shim.service status failed: {e}") from e for raw_line in status.splitlines(): line = raw_line.decode() if line.strip().startswith("Active: failed"): - raise ProvisioningError(f"The dstack-shim service doesn't start: {line.strip()}") + raise SSHProvisioningError(f"The dstack-shim service doesn't start: {line.strip()}") def remove_host_info_if_exists(client: paramiko.SSHClient, working_dir: str) -> None: @@ -188,7 +186,7 @@ def remove_host_info_if_exists(client: paramiko.SSHClient, working_dir: str) -> if err: logger.debug(f"{HOST_INFO_FILE} hasn't been removed: %s", err) except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"remove_host_info_if_exists failed: {e}") + raise SSHProvisioningError(f"remove_host_info_if_exists failed: {e}") def remove_dstack_runner_if_exists(client: paramiko.SSHClient, path: str) -> None: @@ -198,7 +196,7 @@ def remove_dstack_runner_if_exists(client: paramiko.SSHClient, path: str) -> Non if err: logger.debug(f"{path} hasn't been removed: %s", err) except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"remove_dstack_runner_if_exists failed: {e}") + raise SSHProvisioningError(f"remove_dstack_runner_if_exists failed: {e}") def get_host_info(client: paramiko.SSHClient, working_dir: str) -> Dict[str, Any]: @@ -224,11 +222,11 @@ def get_host_info(client: 
paramiko.SSHClient, working_dir: str) -> Dict[str, Any return host_info except ValueError: # JSON parse error check_dstack_shim_service(client) - raise ProvisioningError("Cannot parse host_info") + raise SSHProvisioningError("Cannot parse host_info") time.sleep(iter_delay) else: check_dstack_shim_service(client) - raise ProvisioningError("Cannot get host_info") + raise SSHProvisioningError("Cannot get host_info") def get_shim_healthcheck(client: paramiko.SSHClient) -> str: @@ -240,7 +238,7 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str: return healthcheck logger.debug("healthcheck is empty. retry") time.sleep(iter_delay) - raise ProvisioningError("Cannot get HealthcheckResponse") + raise SSHProvisioningError("Cannot get HealthcheckResponse") def _get_shim_healthcheck(client: paramiko.SSHClient) -> Optional[str]: @@ -251,9 +249,11 @@ def _get_shim_healthcheck(client: paramiko.SSHClient) -> Optional[str]: out = stdout.read().strip().decode() err = stderr.read().strip().decode() except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e + raise SSHProvisioningError(f"get_shim_healthcheck failed: {e}") from e if err: - raise ProvisioningError(f"get_shim_healthcheck didn't work. stdout: {out}, stderr: {err}") + raise SSHProvisioningError( + f"get_shim_healthcheck didn't work. 
stdout: {out}, stderr: {err}" + ) if not out: return None return out @@ -306,7 +306,7 @@ def get_paramiko_connection( ) -> Generator[paramiko.SSHClient, None, None]: if proxy is not None: if proxy_pkeys is None: - raise ProvisioningError("Missing proxy private keys") + raise SSHProvisioningError("Missing proxy private keys") proxy_ctx = get_paramiko_connection( proxy.username, proxy.hostname, proxy.port, proxy_pkeys ) @@ -321,7 +321,7 @@ def get_paramiko_connection( try: proxy_channel = transport.open_channel("direct-tcpip", (host, port), ("", 0)) except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"Proxy channel failed: {e}") from e + raise SSHProvisioningError(f"Proxy channel failed: {e}") from e client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) for pkey in pkeys: logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint) @@ -333,7 +333,7 @@ def get_paramiko_connection( f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}' ) keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys) - raise ProvisioningError( + raise SSHProvisioningError( f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful" ) @@ -347,7 +347,7 @@ def _paramiko_connect( channel: Optional[paramiko.Channel] = None, ) -> bool: """ - Returns `True` if connected, `False` if auth failed, and raises `ProvisioningError` + Returns `True` if connected, `False` if auth failed, and raises `SSHProvisioningError` on other errors. 
""" try: @@ -365,4 +365,4 @@ def _paramiko_connect( except paramiko.AuthenticationException: return False except (paramiko.SSHException, OSError) as e: - raise ProvisioningError(f"Connect failed: {e}") from e + raise SSHProvisioningError(f"Connect failed: {e}") from e diff --git a/src/dstack/_internal/server/services/volumes.py b/src/dstack/_internal/server/services/volumes.py index 1c846c724f..ac2f88a5d1 100644 --- a/src/dstack/_internal/server/services/volumes.py +++ b/src/dstack/_internal/server/services/volumes.py @@ -1,3 +1,4 @@ +import asyncio import uuid from datetime import datetime, timedelta from typing import List, Optional @@ -353,24 +354,29 @@ async def _delete_volumes_pipeline( await session.commit() logger.info("Deleting volumes: %s", [v.name for v in volume_models]) async with get_locker(get_db().dialect_name).lock_ctx(VolumeModel.__tablename__, volumes_ids): - # Refetch after lock - res = await session.execute( - select(VolumeModel) - .where( - VolumeModel.project_id == project.id, - VolumeModel.id.in_(volumes_ids), - VolumeModel.deleted == False, - VolumeModel.lock_expires_at.is_(None), + # Retry locking volumes to increase lock acquisition chances. + # This hack is needed until requests are queued. 
+ volume_models = [] + for i in range(10): + res = await session.execute( + select(VolumeModel) + .where( + VolumeModel.project_id == project.id, + VolumeModel.id.in_(volumes_ids), + VolumeModel.deleted == False, + VolumeModel.lock_expires_at.is_(None), + ) + .options(selectinload(VolumeModel.attachments)) + .order_by(VolumeModel.id) # take locks in order + .with_for_update(key_share=True, of=VolumeModel) + .execution_options(populate_existing=True) ) - .options(selectinload(VolumeModel.attachments)) - .execution_options(populate_existing=True) - .order_by(VolumeModel.id) # take locks in order - .with_for_update(key_share=True, of=VolumeModel) - ) - volume_models = res.scalars().unique().all() + volume_models = res.scalars().unique().all() + if len(volume_models) == len(volumes_ids): + break + await asyncio.sleep(0.5) if len(volume_models) != len(volumes_ids): - # TODO: Make the endpoint fully async so we don't need to lock and error: - # put the request in queue and process in the background. + # TODO: Make the endpoint fully async so we don't need to lock and error. raise ServerClientError( "Failed to delete volumes: volumes are being processed currently. Try again later." ) diff --git a/src/dstack/_internal/utils/common.py b/src/dstack/_internal/utils/common.py index 2db91882ff..c761bfcc28 100644 --- a/src/dstack/_internal/utils/common.py +++ b/src/dstack/_internal/utils/common.py @@ -8,7 +8,7 @@ from datetime import datetime, timedelta, timezone from functools import partial from pathlib import Path -from typing import Any, Iterable, List, Optional, TypeVar, Union +from typing import Any, Final, Iterable, List, Optional, TypeVar, Union from urllib.parse import urlparse from uuid import UUID @@ -17,6 +17,17 @@ from dstack._internal.core.models.common import Duration +class Unset: + pass + + +UNSET: Final = Unset() +""" +Use `UNSET` as kwargs default value to distinguish between +specified and non-specified `Optional` values. 
+""" + + @dataclass class EntityName: name: str diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py b/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py index 6d24669f7c..776240fc47 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_compute_groups.py @@ -1,5 +1,6 @@ +import asyncio import uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from unittest.mock import Mock, patch import pytest @@ -9,7 +10,11 @@ from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.compute_groups import ComputeGroupStatus from dstack._internal.server.background.pipeline_tasks.base import PipelineItem -from dstack._internal.server.background.pipeline_tasks.compute_groups import ComputeGroupWorker +from dstack._internal.server.background.pipeline_tasks.compute_groups import ( + ComputeGroupFetcher, + ComputeGroupPipeline, + ComputeGroupWorker, +) from dstack._internal.server.models import ComputeGroupModel from dstack._internal.server.testing.common import ( ComputeMockSpec, @@ -17,6 +22,7 @@ create_fleet, create_project, ) +from dstack._internal.utils.common import get_current_datetime @pytest.fixture @@ -24,6 +30,17 @@ def worker() -> ComputeGroupWorker: return ComputeGroupWorker(queue=Mock(), heartbeater=Mock()) +@pytest.fixture +def fetcher() -> ComputeGroupFetcher: + return ComputeGroupFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + def _compute_group_to_pipeline_item(compute_group: ComputeGroupModel) -> PipelineItem: assert compute_group.lock_token is not None assert compute_group.lock_expires_at is not None @@ -36,9 +53,104 @@ def _compute_group_to_pipeline_item(compute_group: ComputeGroupModel) -> 
Pipelin ) +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestComputeGroupFetcher: + async def test_fetch_selects_eligible_compute_groups_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: ComputeGroupFetcher + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + eligible = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=stale - timedelta(seconds=2), + ) + finished = await create_compute_group( + session=session, + project=project, + fleet=fleet, + status=ComputeGroupStatus.TERMINATED, + last_processed_at=stale - timedelta(seconds=1), + ) + recent = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=now, + ) + locked = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=stale, + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [eligible.id] + + for compute_group in [eligible, finished, recent, locked]: + await session.refresh(compute_group) + + assert eligible.lock_owner == ComputeGroupPipeline.__name__ + assert eligible.lock_expires_at is not None + assert eligible.lock_token is not None + + assert finished.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_compute_groups_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: ComputeGroupFetcher + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + now = get_current_datetime() + + oldest = await create_compute_group( + 
session=session, + project=project, + fleet=fleet, + last_processed_at=now - timedelta(minutes=3), + ) + middle = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=now - timedelta(minutes=2), + ) + newest = await create_compute_group( + session=session, + project=project, + fleet=fleet, + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == ComputeGroupPipeline.__name__ + assert middle.lock_owner == ComputeGroupPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) class TestComputeGroupWorker: - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_terminates_compute_group( self, test_db, session: AsyncSession, worker: ComputeGroupWorker ): @@ -64,8 +176,6 @@ async def test_terminates_compute_group( assert compute_group.status == ComputeGroupStatus.TERMINATED assert compute_group.deleted - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_retries_compute_group_termination( self, test_db, session: AsyncSession, worker: ComputeGroupWorker ): diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py b/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py index 746ddf2ea4..d2b53226e3 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_fleets.py @@ -1,17 +1,25 @@ +import asyncio import uuid -from datetime import datetime, timezone -from unittest.mock import Mock +from datetime import datetime, timedelta, timezone +from unittest.mock import 
AsyncMock, Mock, patch import pytest from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from dstack._internal.core.models.fleets import FleetNodesSpec, FleetStatus -from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.core.models.fleets import ( + FleetNodesSpec, + FleetStatus, + InstanceGroupPlacement, +) +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason from dstack._internal.core.models.runs import RunStatus from dstack._internal.core.models.users import GlobalRole, ProjectRole +from dstack._internal.server.background.pipeline_tasks import fleets as fleets_pipeline from dstack._internal.server.background.pipeline_tasks.base import PipelineItem from dstack._internal.server.background.pipeline_tasks.fleets import ( + FleetFetcher, + FleetPipeline, FleetWorker, ) from dstack._internal.server.models import FleetModel, InstanceModel @@ -24,8 +32,12 @@ create_repo, create_run, create_user, + get_fleet_configuration, get_fleet_spec, + get_job_provisioning_data, + get_ssh_fleet_configuration, ) +from dstack._internal.utils.common import get_current_datetime @pytest.fixture @@ -33,6 +45,17 @@ def worker() -> FleetWorker: return FleetWorker(queue=Mock(), heartbeater=Mock()) +@pytest.fixture +def fetcher() -> FleetFetcher: + return FleetFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=60), + lock_timeout=timedelta(seconds=20), + heartbeater=Mock(), + ) + + def _fleet_to_pipeline_item(fleet: FleetModel) -> PipelineItem: assert fleet.lock_token is not None assert fleet.lock_expires_at is not None @@ -45,9 +68,735 @@ def _fleet_to_pipeline_item(fleet: FleetModel) -> PipelineItem: ) +async def _lock_fleet_for_processing(session: AsyncSession, fleet: FleetModel) -> None: + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + await session.commit() + + 
+@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestFleetFetcher: + async def test_fetch_selects_eligible_fleets_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: FleetFetcher + ): + project = await create_project(session) + now = get_current_datetime() + + stale = await create_fleet( + session=session, + project=project, + last_processed_at=now - timedelta(minutes=3), + ) + just_created = await create_fleet( + session=session, + project=project, + created_at=now, + last_processed_at=now, + name="just-created", + ) + deleted = await create_fleet( + session=session, + project=project, + deleted=True, + name="deleted", + last_processed_at=now - timedelta(minutes=2), + ) + recent = await create_fleet( + session=session, + project=project, + created_at=now - timedelta(minutes=2), + last_processed_at=now, + name="recent", + ) + locked = await create_fleet( + session=session, + project=project, + name="locked", + last_processed_at=now - timedelta(minutes=1, seconds=1), + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == {stale.id, just_created.id} + + for fleet in [stale, just_created, deleted, recent, locked]: + await session.refresh(fleet) + + assert stale.lock_owner == FleetPipeline.__name__ + assert just_created.lock_owner == FleetPipeline.__name__ + assert stale.lock_expires_at is not None + assert just_created.lock_expires_at is not None + assert stale.lock_token is not None + assert just_created.lock_token is not None + assert len({stale.lock_token, just_created.lock_token}) == 1 + + assert deleted.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_fleets_first_up_to_limit( + self, test_db, session: AsyncSession, 
fetcher: FleetFetcher + ): + project = await create_project(session) + now = get_current_datetime() + + oldest = await create_fleet( + session=session, + project=project, + name="oldest", + last_processed_at=now - timedelta(minutes=4), + ) + middle = await create_fleet( + session=session, + project=project, + name="middle", + last_processed_at=now - timedelta(minutes=3), + ) + newest = await create_fleet( + session=session, + project=project, + name="newest", + last_processed_at=now - timedelta(minutes=2), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == FleetPipeline.__name__ + assert middle.lock_owner == FleetPipeline.__name__ + assert newest.lock_owner is None + + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) class TestFleetWorker: + async def test_skips_instance_locking_for_ssh_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(conf=get_ssh_fleet_configuration()), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + original_last_processed_at = fleet.last_processed_at + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + instance.lock_token = uuid.uuid4() + instance.lock_expires_at = datetime(2025, 1, 2, 3, 5, tzinfo=timezone.utc) + instance.lock_owner = "OtherPipeline" + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance) + assert not fleet.deleted + assert fleet.lock_owner is None + assert fleet.lock_token is None + assert fleet.lock_expires_at is 
None + assert fleet.last_processed_at > original_last_processed_at + assert instance.lock_owner == "OtherPipeline" + + async def test_skips_instance_locking_when_fleet_is_not_ready_for_consolidation( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + original_last_processed_at = fleet.last_processed_at + original_last_consolidated_at = datetime.now(timezone.utc) + fleet.consolidation_attempt = 1 + fleet.last_consolidated_at = original_last_consolidated_at + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + instance.lock_token = uuid.uuid4() + instance.lock_expires_at = datetime(2025, 1, 2, 3, 5, tzinfo=timezone.utc) + instance.lock_owner = "OtherPipeline" + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(instance) + assert not fleet.deleted + assert fleet.consolidation_attempt == 1 + assert fleet.last_consolidated_at == original_last_consolidated_at + assert fleet.lock_owner is None + assert fleet.lock_token is None + assert fleet.lock_expires_at is None + assert fleet.last_processed_at > original_last_processed_at + assert instance.lock_owner == "OtherPipeline" + + async def test_resets_fleet_lock_when_not_all_instances_can_be_locked( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + locked_elsewhere = await create_instance( + 
session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=1, + ) + original_last_processed_at = fleet.last_processed_at + fleet.lock_token = uuid.uuid4() + fleet.lock_expires_at = datetime(2025, 1, 2, 3, 4, tzinfo=timezone.utc) + fleet.lock_owner = FleetPipeline.__name__ + locked_elsewhere.lock_token = uuid.uuid4() + locked_elsewhere.lock_expires_at = datetime(2025, 1, 2, 3, 5, tzinfo=timezone.utc) + locked_elsewhere.lock_owner = "OtherPipeline" + await session.commit() + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(locked_elsewhere) + assert fleet.lock_owner == FleetPipeline.__name__ + assert fleet.lock_token is None + assert fleet.lock_expires_at is None + assert fleet.last_processed_at > original_last_processed_at + assert locked_elsewhere.lock_owner == "OtherPipeline" + + async def test_unlocks_instances_after_consolidation( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(instance) + assert instance.lock_owner is None + assert instance.lock_token is None + assert instance.lock_expires_at is None + + async def test_unlocks_instances_when_fleet_lock_token_changes_after_processing( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + spec = get_fleet_spec() + spec.configuration.nodes = FleetNodesSpec(min=1, target=1, max=1) + fleet = await create_fleet( + session=session, + project=project, + spec=spec, 
+ ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + instance_num=0, + ) + await _lock_fleet_for_processing(session, fleet) + + async def mock_process_fleet(*args, **kwargs): + fleet_model = args[0] + fleet_model.lock_token = uuid.uuid4() + return fleets_pipeline._ProcessResult() + + with patch.object( + fleets_pipeline, + "_process_fleet", + AsyncMock(side_effect=mock_process_fleet), + ): + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(instance) + assert instance.lock_owner is None + assert instance.lock_token is None + assert instance.lock_expires_at is None + + async def test_syncs_initial_current_master_for_cluster_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + first_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id == first_instance.id + + async def test_keeps_current_master_when_it_is_still_active( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + 
nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + current_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PROVISIONING, + job_provisioning_data=get_job_provisioning_data(), + instance_num=1, + ) + fleet.current_master_instance_id = current_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id == current_master.id + + async def test_promotes_provisioned_survivor_when_current_master_terminated( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=2), + ) + ), + ) + terminated_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + provisioned_survivor = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data(), + instance_num=1, + ) + fleet.current_master_instance_id = terminated_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(terminated_master) + assert terminated_master.deleted + assert fleet.current_master_instance_id == provisioned_survivor.id + + async def test_promotes_next_bootstrap_candidate_when_current_master_terminated( + self, test_db, session: AsyncSession, 
worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=2), + ) + ), + ) + terminated_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + next_candidate = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + fleet.current_master_instance_id = terminated_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id == next_candidate.id + + async def test_does_not_elect_terminating_bootstrap_candidate_as_master( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=3), + ) + ), + ) + terminated_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + pending_candidate = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=2, + ) + fleet.current_master_instance_id = terminated_master.id 
+ await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id == pending_candidate.id + + async def test_clears_current_master_for_non_cluster_fleet( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec(), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + fleet.current_master_instance_id = instance.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + assert fleet.current_master_instance_id is None + + async def test_syncs_current_master_after_creating_missing_instances( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + instances = ( + ( + await session.execute( + select(InstanceModel) + .where(InstanceModel.fleet_id == fleet.id, InstanceModel.deleted == False) + .order_by(InstanceModel.instance_num, InstanceModel.created_at) + ) + ) + .scalars() + .all() + ) + assert len(instances) == 2 + assert fleet.current_master_instance_id == instances[0].id + + async def test_prefers_surviving_instance_over_new_replacement_for_master_election( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + 
spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + terminated_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + surviving_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + fleet.current_master_instance_id = terminated_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(terminated_master) + await session.refresh(surviving_instance) + non_deleted_instances = ( + ( + await session.execute( + select(InstanceModel) + .where(InstanceModel.fleet_id == fleet.id, InstanceModel.deleted == False) + .order_by(InstanceModel.instance_num, InstanceModel.created_at) + ) + ) + .scalars() + .all() + ) + + assert terminated_master.deleted + assert fleet.current_master_instance_id == surviving_instance.id + assert len(non_deleted_instances) == 2 + assert any( + instance.id != surviving_instance.id and instance.instance_num == 0 + for instance in non_deleted_instances + ) + + async def test_min_zero_failed_master_terminates_unprovisioned_siblings( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=0, target=3, max=3), + ) + ), + ) + failed_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + 
failed_master.termination_reason = InstanceTerminationReason.NO_OFFERS + sibling1 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=1, + ) + sibling2 = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=2, + ) + fleet.current_master_instance_id = failed_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(failed_master) + await session.refresh(sibling1) + await session.refresh(sibling2) + assert failed_master.deleted + assert sibling1.status == InstanceStatus.TERMINATED + assert sibling2.status == InstanceStatus.TERMINATED + assert sibling1.termination_reason == InstanceTerminationReason.MASTER_FAILED + assert sibling2.termination_reason == InstanceTerminationReason.MASTER_FAILED + assert fleet.current_master_instance_id is None + + async def test_min_zero_failed_master_preserves_provisioned_survivor( + self, test_db, session: AsyncSession, worker: FleetWorker + ): + project = await create_project(session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=0, target=2, max=2), + ) + ), + ) + failed_master = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.TERMINATED, + job_provisioning_data=None, + offer=None, + instance_num=0, + ) + failed_master.termination_reason = InstanceTerminationReason.NO_OFFERS + provisioned_survivor = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data(), + instance_num=1, + ) + pending_sibling 
= await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + job_provisioning_data=None, + offer=None, + instance_num=2, + ) + fleet.current_master_instance_id = failed_master.id + await _lock_fleet_for_processing(session, fleet) + + await worker.process(_fleet_to_pipeline_item(fleet)) + + await session.refresh(fleet) + await session.refresh(provisioned_survivor) + await session.refresh(pending_sibling) + assert provisioned_survivor.status == InstanceStatus.IDLE + assert pending_sibling.status == InstanceStatus.PENDING + assert pending_sibling.termination_reason is None + assert fleet.current_master_instance_id == provisioned_survivor.id + async def test_deletes_empty_autocreated_fleet( self, test_db, session: AsyncSession, worker: FleetWorker ): @@ -392,7 +1141,6 @@ async def test_consolidation_attempt_resets_when_no_changes( ) assert len(instances) == 1 assert fleet.consolidation_attempt == 0 - assert ( - fleet.last_consolidated_at is not None - and fleet.last_consolidated_at > previous_last_consolidated_at - ) + last_consolidated_at = fleet.last_consolidated_at + assert last_consolidated_at + assert last_consolidated_at > previous_last_consolidated_at diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py b/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py index 59cbd370e9..a1d7b360f9 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_gateways.py @@ -1,5 +1,6 @@ +import asyncio import uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock, Mock, patch import pytest @@ -10,6 +11,8 @@ from dstack._internal.core.errors import BackendError from dstack._internal.core.models.gateways import GatewayProvisioningData, GatewayStatus from dstack._internal.server.background.pipeline_tasks.gateways import 
( + GatewayFetcher, + GatewayPipeline, GatewayPipelineItem, GatewayWorker, ) @@ -23,6 +26,7 @@ create_project, list_events, ) +from dstack._internal.utils.common import get_current_datetime @pytest.fixture @@ -30,6 +34,17 @@ def worker() -> GatewayWorker: return GatewayWorker(queue=Mock(), heartbeater=Mock()) +@pytest.fixture +def fetcher() -> GatewayFetcher: + return GatewayFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + def _gateway_to_pipeline_item(gateway_model: GatewayModel) -> GatewayPipelineItem: assert gateway_model.lock_token is not None assert gateway_model.lock_expires_at is not None @@ -44,6 +59,167 @@ def _gateway_to_pipeline_item(gateway_model: GatewayModel) -> GatewayPipelineIte ) +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestGatewayFetcher: + async def test_fetch_selects_eligible_gateways_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: GatewayFetcher + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + submitted = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="submitted", + status=GatewayStatus.SUBMITTED, + last_processed_at=stale - timedelta(seconds=3), + ) + provisioning = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="provisioning", + status=GatewayStatus.PROVISIONING, + last_processed_at=stale - timedelta(seconds=2), + ) + to_be_deleted = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="to-be-deleted", + status=GatewayStatus.RUNNING, + last_processed_at=stale - timedelta(seconds=1), + ) + to_be_deleted.to_be_deleted = True + + 
just_created = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="just-created", + status=GatewayStatus.SUBMITTED, + last_processed_at=now, + ) + just_created.created_at = now + just_created.last_processed_at = now + + ineligible_status = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="ineligible-status", + status=GatewayStatus.RUNNING, + last_processed_at=stale, + ) + recent = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="recent", + status=GatewayStatus.SUBMITTED, + last_processed_at=now, + ) + recent.created_at = now - timedelta(minutes=2) + recent.last_processed_at = now + + locked = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="locked", + status=GatewayStatus.SUBMITTED, + last_processed_at=stale + timedelta(seconds=1), + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == { + submitted.id, + provisioning.id, + to_be_deleted.id, + just_created.id, + } + assert {(item.id, item.status, item.to_be_deleted) for item in items} == { + (submitted.id, GatewayStatus.SUBMITTED, False), + (provisioning.id, GatewayStatus.PROVISIONING, False), + (to_be_deleted.id, GatewayStatus.RUNNING, True), + (just_created.id, GatewayStatus.SUBMITTED, False), + } + + for gateway in [ + submitted, + provisioning, + to_be_deleted, + just_created, + ineligible_status, + recent, + locked, + ]: + await session.refresh(gateway) + + fetched_gateways = [submitted, provisioning, to_be_deleted, just_created] + assert all(gateway.lock_owner == GatewayPipeline.__name__ for gateway in fetched_gateways) + assert all(gateway.lock_expires_at is not None for gateway in fetched_gateways) + assert all(gateway.lock_token is 
not None for gateway in fetched_gateways) + assert len({gateway.lock_token for gateway in fetched_gateways}) == 1 + + assert ineligible_status.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_gateways_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: GatewayFetcher + ): + project = await create_project(session=session) + backend = await create_backend(session=session, project_id=project.id) + now = get_current_datetime() + + oldest = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="oldest", + status=GatewayStatus.SUBMITTED, + last_processed_at=now - timedelta(minutes=3), + ) + middle = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="middle", + status=GatewayStatus.PROVISIONING, + last_processed_at=now - timedelta(minutes=2), + ) + newest = await create_gateway( + session=session, + project_id=project.id, + backend_id=backend.id, + name="newest", + status=GatewayStatus.SUBMITTED, + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == GatewayPipeline.__name__ + assert middle.lock_owner == GatewayPipeline.__name__ + assert newest.lock_owner is None + + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) class TestGatewayWorkerSubmitted: diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/__init__.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/conftest.py 
b/src/tests/_internal/server/background/pipeline_tasks/test_instances/conftest.py new file mode 100644 index 0000000000..f7600e0ba6 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/conftest.py @@ -0,0 +1,58 @@ +import asyncio +import datetime as dt +from unittest.mock import Mock + +import pytest + +from dstack._internal.core.backends.base.compute import GoArchType +from dstack._internal.server.background.pipeline_tasks.instances import ( + InstanceFetcher, + InstanceWorker, +) +from dstack._internal.server.background.pipeline_tasks.instances import ( + ssh_deploy as instances_ssh_deploy, +) +from dstack._internal.server.schemas.instances import InstanceCheck + + +@pytest.fixture +def fetcher() -> InstanceFetcher: + return InstanceFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=dt.timedelta(seconds=10), + lock_timeout=dt.timedelta(seconds=30), + heartbeater=Mock(), + ) + + +@pytest.fixture +def worker() -> InstanceWorker: + return InstanceWorker(queue=asyncio.Queue(), heartbeater=Mock()) + + +@pytest.fixture +def host_info() -> dict: + return { + "gpu_vendor": "nvidia", + "gpu_name": "T4", + "gpu_memory": 16384, + "gpu_count": 1, + "addresses": ["192.168.100.100/24"], + "disk_size": 260976517120, + "cpus": 32, + "memory": 33544130560, + } + + +@pytest.fixture +def deploy_instance_mock(monkeypatch: pytest.MonkeyPatch, host_info: dict) -> Mock: + mock = Mock( + return_value=( + InstanceCheck(reachable=True), + host_info, + GoArchType.AMD64, + ) + ) + monkeypatch.setattr(instances_ssh_deploy, "_deploy_instance", mock) + return mock diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/helpers.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/helpers.py new file mode 100644 index 0000000000..81eb0fde5c --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/helpers.py @@ -0,0 +1,40 @@ +import datetime as dt 
+import uuid + +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.background.pipeline_tasks.instances import ( + InstancePipeline, + InstancePipelineItem, + InstanceWorker, +) +from dstack._internal.server.models import InstanceModel + +LOCK_EXPIRES_AT = dt.datetime(2025, 1, 2, 3, 4, tzinfo=dt.timezone.utc) + + +def instance_to_pipeline_item(instance_model: InstanceModel) -> InstancePipelineItem: + assert instance_model.lock_token is not None + assert instance_model.lock_expires_at is not None + return InstancePipelineItem( + __tablename__=instance_model.__tablename__, + id=instance_model.id, + lock_token=instance_model.lock_token, + lock_expires_at=instance_model.lock_expires_at, + prev_lock_expired=False, + status=instance_model.status, + ) + + +def lock_instance(instance_model: InstanceModel) -> None: + instance_model.lock_token = uuid.uuid4() + instance_model.lock_expires_at = LOCK_EXPIRES_AT + instance_model.lock_owner = InstancePipeline.__name__ + + +async def process_instance( + session: AsyncSession, worker: InstanceWorker, instance_model: InstanceModel +) -> None: + lock_instance(instance_model) + await session.commit() + await worker.process(instance_to_pipeline_item(instance_model)) diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_check.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_check.py new file mode 100644 index 0000000000..b555556881 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_check.py @@ -0,0 +1,944 @@ +import datetime as dt +import logging +from unittest.mock import Mock + +import pytest +import pytest_asyncio +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.fleets import FleetNodesSpec +from dstack._internal.core.models.health import HealthStatus +from dstack._internal.core.models.instances import InstanceStatus, 
InstanceTerminationReason +from dstack._internal.core.models.profiles import TerminationPolicy +from dstack._internal.core.models.runs import JobStatus +from dstack._internal.server.background.pipeline_tasks.instances import InstanceWorker +from dstack._internal.server.background.pipeline_tasks.instances import check as instances_check +from dstack._internal.server.models import InstanceHealthCheckModel, InstanceModel +from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse, DCGMHealthResult +from dstack._internal.server.schemas.instances import InstanceCheck +from dstack._internal.server.schemas.runner import ( + ComponentInfo, + ComponentName, + ComponentStatus, + HealthcheckResponse, + InstanceHealthResponse, + TaskListResponse, +) +from dstack._internal.server.services.runner.client import ComponentList, ShimClient +from dstack._internal.server.testing.common import ( + create_fleet, + create_instance, + create_job, + create_project, + create_repo, + create_run, + create_user, + get_fleet_configuration, + get_fleet_spec, + get_remote_connection_info, + list_events, +) +from dstack._internal.utils.common import get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + process_instance, +) + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestCheckInstance: + async def test_check_shim_transitions_provisioning_on_ready( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + ) + instance.termination_deadline = get_current_datetime() + dt.timedelta(days=1) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + 
Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.termination_deadline is None + + async def test_check_shim_transitions_provisioning_on_terminating( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + ) + instance.started_at = get_current_datetime() + dt.timedelta(minutes=-20) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False, message="Shim problem")), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATING + assert instance.termination_deadline is not None + + async def test_check_shim_transitions_provisioning_on_busy( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + user = await create_user(session=session) + project = await create_project(session=session, owner=user) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + ) + instance.termination_deadline = get_current_datetime().replace( + tzinfo=dt.timezone.utc + ) + dt.timedelta(days=1) + job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + instance=instance, + ) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, 
worker, instance) + + await session.refresh(instance) + await session.refresh(job) + + assert instance.status == InstanceStatus.BUSY + assert instance.termination_deadline is None + assert job.instance == instance + + async def test_check_shim_start_termination_deadline( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False, message="SSH connection fail")), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is True + assert instance.termination_deadline is not None + assert instance.termination_deadline.replace( + tzinfo=dt.timezone.utc + ) > get_current_datetime() + dt.timedelta(minutes=19) + + async def test_check_shim_does_not_start_termination_deadline_with_ssh_instance( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + remote_connection_info=get_remote_connection_info(), + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False, message="SSH connection fail")), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is True + assert instance.termination_deadline is None + + async def test_check_shim_stop_termination_deadline( + self, + test_db, + session: AsyncSession, + worker: 
InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + ) + instance.termination_deadline = get_current_datetime() + dt.timedelta(minutes=19) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.termination_deadline is None + + async def test_check_shim_terminate_instance_by_deadline( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + ) + termination_deadline_time = get_current_datetime() + dt.timedelta(minutes=-19) + instance.termination_deadline = termination_deadline_time + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False, message="Not ok")), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATING + assert instance.termination_deadline == termination_deadline_time + assert instance.termination_reason == InstanceTerminationReason.UNREACHABLE + + @pytest.mark.parametrize( + ["termination_policy", "has_job"], + [ + pytest.param(TerminationPolicy.DESTROY_AFTER_IDLE, False, id="destroy-no-job"), + pytest.param(TerminationPolicy.DESTROY_AFTER_IDLE, True, id="destroy-with-job"), + pytest.param(TerminationPolicy.DONT_DESTROY, False, id="dont-destroy-no-job"), + pytest.param(TerminationPolicy.DONT_DESTROY, True, id="dont-destroy-with-job"), + ], + ) + async def 
test_check_shim_process_unreachable_state( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + termination_policy: TerminationPolicy, + has_job: bool, + ): + project = await create_project(session=session) + if has_job: + user = await create_user(session=session) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run(session=session, project=project, repo=repo, user=user) + job = await create_job( + session=session, + run=run, + status=JobStatus.SUBMITTED, + ) + else: + job = None + instance = await create_instance( + session=session, + project=project, + created_at=get_current_datetime(), + termination_policy=termination_policy, + status=InstanceStatus.IDLE, + unreachable=True, + job=job, + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + events = await list_events(session) + + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is False + assert len(events) == 1 + assert events[0].message == "Instance became reachable" + + @pytest.mark.parametrize("health_status", [HealthStatus.HEALTHY, HealthStatus.FAILURE]) + async def test_check_shim_switch_to_unreachable_state( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + health_status: HealthStatus, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + health_status=health_status, + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=False)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + events = await list_events(session) + + assert 
instance.status == InstanceStatus.IDLE + assert instance.unreachable is True + assert instance.health == health_status + assert len(events) == 1 + assert events[0].message == "Instance became unreachable" + + async def test_check_shim_check_instance_health( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + unreachable=False, + health_status=HealthStatus.HEALTHY, + ) + health_response = InstanceHealthResponse( + dcgm=DCGMHealthResponse( + overall_health=DCGMHealthResult.DCGM_HEALTH_RESULT_WARN, + incidents=[], + ) + ) + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock( + return_value=InstanceCheck( + reachable=True, + health_response=health_response, + ) + ), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + events = await list_events(session) + + assert instance.status == InstanceStatus.IDLE + assert instance.unreachable is False + assert instance.health == HealthStatus.WARNING + assert len(events) == 1 + assert events[0].message == "Instance health changed HEALTHY -> WARNING" + + res = await session.execute(select(InstanceHealthCheckModel)) + health_check = res.scalars().one() + assert health_check.status == HealthStatus.WARNING + assert health_check.response == health_response.json() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestProcessIdleTimeout: + async def test_does_not_terminate_by_idle_timeout_when_fleet_at_min_nodes( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + 
get_fleet_configuration(nodes=FleetNodesSpec(min=1, target=1, max=1)) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + instance.termination_idle_time = 300 + instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE + instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) + await session.commit() + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + + await process_instance(session, worker, instance) + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.termination_reason is None + + async def test_terminates_by_idle_timeout_when_fleet_above_min_nodes( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session=session, + project=project, + spec=get_fleet_spec( + get_fleet_configuration(nodes=FleetNodesSpec(min=1, target=2, max=2)) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + ) + instance.termination_idle_time = 300 + instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE + instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) + await session.commit() + + await process_instance(session, worker, instance) + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATING + assert instance.termination_reason == InstanceTerminationReason.IDLE_TIMEOUT + + async def test_terminate_by_idle_timeout( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + 
session=session, + project=project, + status=InstanceStatus.IDLE, + ) + instance.termination_idle_time = 300 + instance.termination_policy = TerminationPolicy.DESTROY_AFTER_IDLE + instance.last_job_processed_at = get_current_datetime() + dt.timedelta(minutes=-19) + await session.commit() + + await process_instance(session, worker, instance) + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATING + assert instance.termination_reason == InstanceTerminationReason.IDLE_TIMEOUT + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class BaseTestMaybeInstallComponents: + EXPECTED_VERSION = "0.20.1" + + @pytest_asyncio.fixture + async def instance(self, session: AsyncSession) -> InstanceModel: + project = await create_project(session=session) + return await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + + @pytest.fixture + def component_list(self) -> ComponentList: + return ComponentList() + + @pytest.fixture + def debug_task_log(self, caplog: pytest.LogCaptureFixture) -> pytest.LogCaptureFixture: + caplog.set_level(level=logging.DEBUG, logger=instances_check.__name__) + return caplog + + @pytest.fixture + def shim_client_mock( + self, + monkeypatch: pytest.MonkeyPatch, + component_list: ComponentList, + ) -> Mock: + mock = Mock(spec_set=ShimClient) + mock.healthcheck.return_value = HealthcheckResponse( + service="dstack-shim", + version=self.EXPECTED_VERSION, + ) + mock.get_instance_health.return_value = InstanceHealthResponse() + mock.get_components.return_value = component_list + mock.list_tasks.return_value = TaskListResponse(tasks=[]) + mock.is_safe_to_restart.return_value = False + monkeypatch.setattr( + "dstack._internal.server.services.runner.client.ShimClient", + Mock(return_value=mock), + ) + return mock + + +@pytest.mark.usefixtures("get_dstack_runner_version_mock") +class TestMaybeInstallRunner(BaseTestMaybeInstallComponents): + 
@pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.RUNNER, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + + @pytest.fixture + def get_dstack_runner_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=self.EXPECTED_VERSION) + monkeypatch.setattr(instances_check, "get_dstack_runner_version", mock) + return mock + + @pytest.fixture + def get_dstack_runner_download_url_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value="https://example.com/runner") + monkeypatch.setattr(instances_check, "get_dstack_runner_download_url", mock) + return mock + + async def test_cannot_determine_expected_version( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_runner_version_mock: Mock, + ): + get_dstack_runner_version_mock.return_value = None + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_not_called() + + async def test_expected_version_already_installed( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + ): + shim_client_mock.get_components.return_value.runner.version = self.EXPECTED_VERSION + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert "expected runner version already installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_not_called() + + @pytest.mark.parametrize("status", [ComponentStatus.NOT_INSTALLED, ComponentStatus.ERROR]) + async def test_install_not_installed_or_error( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + 
get_dstack_runner_download_url_mock: Mock, + status: ComponentStatus, + ): + shim_client_mock.get_components.return_value.runner.version = "" + shim_client_mock.get_components.return_value.runner.status = status + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert f"installing runner (no version) -> {self.EXPECTED_VERSION}" in debug_task_log.text + get_dstack_runner_download_url_mock.assert_called_once_with( + arch=None, + version=self.EXPECTED_VERSION, + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_called_once_with( + get_dstack_runner_download_url_mock.return_value + ) + + @pytest.mark.parametrize("installed_version", ["0.19.40", "0.21.0", "dev"]) + async def test_install_installed( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_runner_download_url_mock: Mock, + installed_version: str, + ): + shim_client_mock.get_components.return_value.runner.version = installed_version + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert ( + f"installing runner {installed_version} -> {self.EXPECTED_VERSION}" + in debug_task_log.text + ) + get_dstack_runner_download_url_mock.assert_called_once_with( + arch=None, + version=self.EXPECTED_VERSION, + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_called_once_with( + get_dstack_runner_download_url_mock.return_value + ) + + async def test_already_installing( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + ): + shim_client_mock.get_components.return_value.runner.version = "dev" + shim_client_mock.get_components.return_value.runner.status = ComponentStatus.INSTALLING + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert "runner is already being installed" in debug_task_log.text + 
shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_runner.assert_not_called() + + +@pytest.mark.usefixtures("get_dstack_shim_version_mock") +class TestMaybeInstallShim(BaseTestMaybeInstallComponents): + @pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.SHIM, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + + @pytest.fixture + def get_dstack_shim_version_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=self.EXPECTED_VERSION) + monkeypatch.setattr(instances_check, "get_dstack_shim_version", mock) + return mock + + @pytest.fixture + def get_dstack_shim_download_url_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value="https://example.com/shim") + monkeypatch.setattr(instances_check, "get_dstack_shim_download_url", mock) + return mock + + async def test_cannot_determine_expected_version( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_shim_version_mock: Mock, + ): + get_dstack_shim_version_mock.return_value = None + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + async def test_expected_version_already_installed( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + ): + shim_client_mock.get_components.return_value.shim.version = self.EXPECTED_VERSION + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert "expected shim version already installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + @pytest.mark.parametrize("status", 
[ComponentStatus.NOT_INSTALLED, ComponentStatus.ERROR]) + async def test_install_not_installed_or_error( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_shim_download_url_mock: Mock, + status: ComponentStatus, + ): + shim_client_mock.get_components.return_value.shim.version = "" + shim_client_mock.get_components.return_value.shim.status = status + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert f"installing shim (no version) -> {self.EXPECTED_VERSION}" in debug_task_log.text + get_dstack_shim_download_url_mock.assert_called_once_with( + arch=None, + version=self.EXPECTED_VERSION, + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_called_once_with( + get_dstack_shim_download_url_mock.return_value + ) + + @pytest.mark.parametrize("installed_version", ["0.19.40", "0.21.0", "dev"]) + async def test_install_installed( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + get_dstack_shim_download_url_mock: Mock, + installed_version: str, + ): + shim_client_mock.get_components.return_value.shim.version = installed_version + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert ( + f"installing shim {installed_version} -> {self.EXPECTED_VERSION}" + in debug_task_log.text + ) + get_dstack_shim_download_url_mock.assert_called_once_with( + arch=None, + version=self.EXPECTED_VERSION, + ) + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_called_once_with( + get_dstack_shim_download_url_mock.return_value + ) + + async def test_already_installing( + self, + test_db, + instance: InstanceModel, + debug_task_log: pytest.LogCaptureFixture, + shim_client_mock: Mock, + ): + shim_client_mock.get_components.return_value.shim.version = "dev" + 
shim_client_mock.get_components.return_value.shim.status = ComponentStatus.INSTALLING + + instances_check._maybe_install_components(instance, shim_client_mock) + + assert "shim is already being installed" in debug_task_log.text + shim_client_mock.get_components.assert_called_once() + shim_client_mock.install_shim.assert_not_called() + + +@pytest.mark.usefixtures("maybe_install_runner_mock", "maybe_install_shim_mock") +class TestMaybeRestartShim(BaseTestMaybeInstallComponents): + @pytest.fixture + def component_list(self) -> ComponentList: + components = ComponentList() + components.add( + ComponentInfo( + name=ComponentName.RUNNER, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + components.add( + ComponentInfo( + name=ComponentName.SHIM, + version=self.EXPECTED_VERSION, + status=ComponentStatus.INSTALLED, + ), + ) + return components + + @pytest.fixture + def maybe_install_runner_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=False) + monkeypatch.setattr(instances_check, "_maybe_install_runner", mock) + return mock + + @pytest.fixture + def maybe_install_shim_mock(self, monkeypatch: pytest.MonkeyPatch) -> Mock: + mock = Mock(return_value=False) + monkeypatch.setattr(instances_check, "_maybe_install_shim", mock) + return mock + + async def test_up_to_date(self, test_db, instance: InstanceModel, shim_client_mock: Mock): + shim_client_mock.get_version_string.return_value = self.EXPECTED_VERSION + shim_client_mock.is_safe_to_restart.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_no_shim_component_info( + self, test_db, instance: InstanceModel, shim_client_mock: Mock + ): + shim_client_mock.get_components.return_value = ComponentList() + shim_client_mock.get_version_string.return_value = "outdated" + 
shim_client_mock.is_safe_to_restart.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_shutdown_requested( + self, test_db, instance: InstanceModel, shim_client_mock: Mock + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_called_once_with(force=False) + + async def test_outdated_but_task_wont_survive_restart( + self, test_db, instance: InstanceModel, shim_client_mock: Mock + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = False + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_runner_installation_in_progress( + self, + test_db, + instance: InstanceModel, + shim_client_mock: Mock, + component_list: ComponentList, + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + runner_info = component_list.runner + assert runner_info is not None + runner_info.status = ComponentStatus.INSTALLING + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_shim_installation_in_progress( + self, + test_db, + instance: InstanceModel, + shim_client_mock: Mock, + component_list: ComponentList, + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + 
shim_info = component_list.shim + assert shim_info is not None + shim_info.status = ComponentStatus.INSTALLING + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_runner_installation_requested( + self, + test_db, + instance: InstanceModel, + shim_client_mock: Mock, + maybe_install_runner_mock: Mock, + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + maybe_install_runner_mock.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() + + async def test_outdated_but_shim_installation_requested( + self, + test_db, + instance: InstanceModel, + shim_client_mock: Mock, + maybe_install_shim_mock: Mock, + ): + shim_client_mock.get_version_string.return_value = "outdated" + shim_client_mock.is_safe_to_restart.return_value = True + maybe_install_shim_mock.return_value = True + + instances_check._maybe_install_components(instance, shim_client_mock) + + shim_client_mock.get_components.assert_called_once() + shim_client_mock.shutdown.assert_not_called() diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_cloud_provisioning.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_cloud_provisioning.py new file mode 100644 index 0000000000..afcb75336b --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_cloud_provisioning.py @@ -0,0 +1,872 @@ +from typing import Optional +from unittest.mock import Mock, patch + +import gpuhunt +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import NoCapacityError, ProvisioningError +from 
dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.fleets import FleetNodesSpec, InstanceGroupPlacement +from dstack._internal.core.models.instances import ( + Gpu, + InstanceAvailability, + InstanceOffer, + InstanceOfferWithAvailability, + InstanceStatus, + InstanceTerminationReason, + InstanceType, + Resources, +) +from dstack._internal.core.models.placement import PlacementGroup, PlacementGroupProvisioningData +from dstack._internal.core.models.runs import JobProvisioningData +from dstack._internal.server.background.pipeline_tasks.instances import InstanceWorker +from dstack._internal.server.models import PlacementGroupModel +from dstack._internal.server.testing.common import ( + ComputeMockSpec, + create_fleet, + create_instance, + create_placement_group, + create_project, + get_fleet_configuration, + get_fleet_spec, + get_instance_offer_with_availability, + get_job_provisioning_data, + get_placement_group_provisioning_data, +) +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + instance_to_pipeline_item, + lock_instance, + process_instance, +) + + +async def _set_current_master_instance(session: AsyncSession, fleet, instance) -> None: + fleet.current_master_instance_id = None if instance is None else instance.id + await session.commit() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestCloudProvisioning: + @pytest.mark.parametrize( + ["cpus", "gpus", "requested_blocks", "expected_blocks"], + [ + pytest.param(32, 8, 1, 1, id="gpu-instance-no-blocks"), + pytest.param(32, 8, 2, 2, id="gpu-instance-four-gpu-per-block"), + pytest.param(32, 8, 4, 4, id="gpu-instance-two-gpus-per-block"), + pytest.param(32, 8, None, 8, id="gpu-instance-auto-max-gpu"), + pytest.param(4, 8, None, 4, id="gpu-instance-auto-max-cpu"), + pytest.param(8, 8, None, 8, id="gpu-instance-auto-max-cpu-and-gpu"), + pytest.param(32, 0, 1, 1, 
id="cpu-instance-no-blocks"), + pytest.param(32, 0, 2, 2, id="cpu-instance-four-cpu-per-block"), + pytest.param(32, 0, 4, 4, id="cpu-instance-two-cpus-per-block"), + pytest.param(32, 0, None, 32, id="cpu-instance-auto-max-cpu"), + ], + ) + async def test_creates_instance( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + cpus: int, + gpus: int, + requested_blocks: Optional[int], + expected_blocks: int, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + total_blocks=requested_blocks, + busy_blocks=0, + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + backend_mock = Mock() + m.return_value = [backend_mock] + backend_mock.TYPE = BackendType.AWS + gpu = Gpu(name="T4", memory_mib=16384, vendor=gpuhunt.AcceleratorVendor.NVIDIA) + offer = InstanceOfferWithAvailability( + backend=BackendType.AWS, + instance=InstanceType( + name="instance", + resources=Resources( + cpus=cpus, + memory_mib=131072, + spot=False, + gpus=[gpu] * gpus, + ), + ), + region="us", + price=1.0, + availability=InstanceAvailability.AVAILABLE, + total_blocks=expected_blocks, + ) + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [offer] + backend_mock.compute.return_value.create_instance.return_value = JobProvisioningData( + backend=offer.backend, + instance_type=offer.instance, + instance_id="instance_id", + hostname="1.1.1.1", + internal_ip=None, + region=offer.region, + price=offer.price, + username="ubuntu", + ssh_port=22, + ssh_proxy=None, + dockerized=True, + backend_data=None, + ) + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + assert instance.total_blocks == expected_blocks + assert instance.busy_blocks == 0 + + @pytest.mark.parametrize("err", 
[RuntimeError("Unexpected"), ProvisioningError("Expected")]) + async def test_tries_second_offer_if_first_fails( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + err: Exception, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + ) + aws_mock = Mock() + aws_mock.TYPE = BackendType.AWS + offer = get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0) + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.get_offers.return_value = [offer] + aws_mock.compute.return_value.create_instance.side_effect = err + gcp_mock = Mock() + gcp_mock.TYPE = BackendType.GCP + offer = get_instance_offer_with_availability(backend=BackendType.GCP, price=2.0) + gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec) + gcp_mock.compute.return_value.get_offers.return_value = [offer] + gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=offer.backend, + region=offer.region, + price=offer.price, + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [aws_mock, gcp_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + aws_mock.compute.return_value.create_instance.assert_called_once() + assert instance.backend == BackendType.GCP + + @pytest.mark.parametrize("err", [RuntimeError("Unexpected"), ProvisioningError("Expected")]) + async def test_fails_if_all_offers_fail( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + err: Exception, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + ) + aws_mock = Mock() + aws_mock.TYPE = BackendType.AWS + offer = 
get_instance_offer_with_availability(backend=BackendType.AWS, price=1.0) + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.get_offers.return_value = [offer] + aws_mock.compute.return_value.create_instance.side_effect = err + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [aws_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.NO_OFFERS + + async def test_fails_if_no_offers( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.NO_OFFERS + + async def test_waits_when_fleet_has_no_current_master( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=0, + ) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + 
m.return_value = [backend_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PENDING + assert backend_mock.compute.return_value.create_instance.call_count == 0 + + async def test_waits_for_current_master_to_determine_cluster_placement( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=0, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(master_instance) + await session.refresh(sibling_instance) + assert master_instance.status == InstanceStatus.PENDING + assert sibling_instance.status == InstanceStatus.PENDING + assert backend_mock.compute.return_value.create_instance.call_count == 0 + + async def test_failed_master_does_not_provision_stale_sibling_until_fleet_reassigns_it( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + 
conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=0, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + lock_instance(master_instance) + lock_instance(sibling_instance) + await session.commit() + master_item = instance_to_pipeline_item(master_instance) + sibling_item = instance_to_pipeline_item(sibling_instance) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [] + await worker.process(master_item) + + await session.refresh(master_instance) + await session.refresh(sibling_instance) + assert master_instance.status == InstanceStatus.TERMINATED + assert master_instance.termination_reason == InstanceTerminationReason.NO_OFFERS + assert sibling_instance.status == InstanceStatus.PENDING + + gcp_mock = Mock() + gcp_mock.TYPE = BackendType.GCP + gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec) + gcp_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.GCP, region="us-central1") + ] + gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.GCP, + region="us-central1", + ) + aws_mock = Mock() + aws_mock.TYPE = BackendType.AWS + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + aws_mock.compute.return_value.create_placement_group.return_value = ( + 
get_placement_group_provisioning_data() + ) + aws_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [gcp_mock, aws_mock] + await worker.process(sibling_item) + + await session.refresh(sibling_instance) + assert sibling_instance.status == InstanceStatus.PENDING + assert gcp_mock.compute.return_value.get_offers.call_count == 0 + assert gcp_mock.compute.return_value.create_instance.call_count == 0 + assert aws_mock.compute.return_value.create_instance.call_count == 0 + + await _set_current_master_instance(session, fleet, sibling_instance) + promoted_backend_mock = Mock() + promoted_backend_mock.TYPE = BackendType.AWS + promoted_backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + promoted_backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + promoted_backend_mock.compute.return_value.create_placement_group.return_value = ( + get_placement_group_provisioning_data() + ) + promoted_backend_mock.compute.return_value.create_instance.return_value = ( + get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [promoted_backend_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(sibling_instance) + assert sibling_instance.status == InstanceStatus.PROVISIONING + assert sibling_instance.backend == BackendType.AWS + assert sibling_instance.region == "us-east-1" + assert promoted_backend_mock.compute.return_value.create_instance.call_count == 1 + + async def test_follows_current_master_backend_and_region_constraints( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await 
create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ), + instance_num=0, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + gcp_mock = Mock() + gcp_mock.TYPE = BackendType.GCP + gcp_mock.compute.return_value = Mock(spec=ComputeMockSpec) + gcp_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.GCP, region="us-central1") + ] + gcp_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.GCP, + region="us-central1", + ) + aws_mock = Mock() + aws_mock.TYPE = BackendType.AWS + aws_mock.compute.return_value = Mock(spec=ComputeMockSpec) + aws_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + aws_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [gcp_mock, aws_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(sibling_instance) + assert sibling_instance.status == InstanceStatus.PROVISIONING + assert sibling_instance.backend == BackendType.AWS + assert sibling_instance.region == "us-east-1" + assert 
gcp_mock.compute.return_value.get_offers.call_count == 0 + assert gcp_mock.compute.return_value.create_instance.call_count == 0 + assert aws_mock.compute.return_value.create_instance.call_count == 1 + + async def test_non_master_does_not_create_new_placement_group_without_master_pg( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=2, target=2, max=2), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ), + instance_num=0, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + backend_mock.compute.return_value.is_suitable_placement_group.return_value = True + backend_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(sibling_instance) + assert sibling_instance.status == InstanceStatus.PROVISIONING + assert 
backend_mock.compute.return_value.create_placement_group.call_count == 0 + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + assert len(placement_groups) == 0 + + async def test_non_master_reuses_existing_current_master_placement_group( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=3, target=3, max=3), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ), + instance_num=0, + ) + current_master_pg = await create_placement_group( + session=session, + project=project, + fleet=fleet, + ) + sibling_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + backend_mock.compute.return_value.is_suitable_placement_group.return_value = True + backend_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, sibling_instance) + + await session.refresh(sibling_instance) + assert 
sibling_instance.status == InstanceStatus.PROVISIONING + assert backend_mock.compute.return_value.create_placement_group.call_count == 0 + create_call = backend_mock.compute.return_value.create_instance.call_args + assert create_call is not None + assert create_call.args[2] is not None + assert create_call.args[2].name == current_master_pg.name + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + assert len(placement_groups) == 1 + + async def test_allows_parallel_processing_after_master_is_provisioned( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=3, target=3, max=3), + ) + ), + ) + master_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.IDLE, + job_provisioning_data=get_job_provisioning_data( + backend=BackendType.AWS, + region="us-east-1", + ), + instance_num=0, + ) + later_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=2, + ) + earlier_instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + instance_num=1, + ) + await _set_current_master_instance(session, fleet, master_instance) + + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(backend=BackendType.AWS, region="us-east-1") + ] + backend_mock.compute.return_value.create_instance.return_value = get_job_provisioning_data( + backend=BackendType.AWS, 
+ region="us-east-1", + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, later_instance) + assert backend_mock.compute.return_value.create_instance.call_count == 1 + await process_instance(session, worker, earlier_instance) + + await session.refresh(later_instance) + await session.refresh(earlier_instance) + assert later_instance.status == InstanceStatus.PROVISIONING + assert earlier_instance.status == InstanceStatus.PROVISIONING + assert backend_mock.compute.return_value.create_instance.call_count == 2 + + @pytest.mark.parametrize( + ("placement", "should_create"), + [ + pytest.param(InstanceGroupPlacement.CLUSTER, True, id="placement-cluster"), + pytest.param(None, False, id="no-placement"), + ], + ) + async def test_create_placement_group_if_placement_cluster( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + placement: Optional[InstanceGroupPlacement], + should_create: bool, + ) -> None: + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=placement, nodes=FleetNodesSpec(min=1, target=1, max=1) + ) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + ) + if placement == InstanceGroupPlacement.CLUSTER: + await _set_current_master_instance(session, fleet, instance) + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability() + ] + backend_mock.compute.return_value.create_instance.return_value = ( + get_job_provisioning_data() + ) + backend_mock.compute.return_value.create_placement_group.return_value = ( + 
get_placement_group_provisioning_data() + ) + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + if should_create: + assert backend_mock.compute.return_value.create_placement_group.call_count == 1 + assert len(placement_groups) == 1 + else: + assert backend_mock.compute.return_value.create_placement_group.call_count == 0 + assert len(placement_groups) == 0 + + @pytest.mark.parametrize("can_reuse", [True, False]) + async def test_reuses_placement_group_between_offers_if_the_group_is_suitable( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + can_reuse: bool, + ) -> None: + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=1), + ) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + ) + await _set_current_master_instance(session, fleet, instance) + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + get_instance_offer_with_availability(instance_type="bad-offer-1"), + get_instance_offer_with_availability(instance_type="bad-offer-2"), + get_instance_offer_with_availability(instance_type="good-offer"), + ] + + def create_instance_method( + instance_offer: InstanceOfferWithAvailability, *args, **kwargs + ) -> JobProvisioningData: + if instance_offer.instance.name == "good-offer": + return 
get_job_provisioning_data() + raise NoCapacityError() + + backend_mock.compute.return_value.create_instance = create_instance_method + backend_mock.compute.return_value.create_placement_group.return_value = ( + get_placement_group_provisioning_data() + ) + backend_mock.compute.return_value.is_suitable_placement_group.return_value = can_reuse + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + if can_reuse: + assert backend_mock.compute.return_value.create_placement_group.call_count == 1 + assert len(placement_groups) == 1 + else: + assert backend_mock.compute.return_value.create_placement_group.call_count == 3 + assert len(placement_groups) == 3 + to_be_deleted_count = sum(pg.fleet_deleted for pg in placement_groups) + assert to_be_deleted_count == 2 + + @pytest.mark.parametrize("err", [NoCapacityError(), RuntimeError()]) + async def test_handles_create_placement_group_errors( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + err: Exception, + ) -> None: + project = await create_project(session=session) + fleet = await create_fleet( + session, + project, + spec=get_fleet_spec( + conf=get_fleet_configuration( + placement=InstanceGroupPlacement.CLUSTER, + nodes=FleetNodesSpec(min=1, target=1, max=1), + ) + ), + ) + instance = await create_instance( + session=session, + project=project, + fleet=fleet, + status=InstanceStatus.PENDING, + offer=None, + job_provisioning_data=None, + ) + await _set_current_master_instance(session, fleet, instance) + backend_mock = Mock() + backend_mock.TYPE = BackendType.AWS + backend_mock.compute.return_value = Mock(spec=ComputeMockSpec) + backend_mock.compute.return_value.get_offers.return_value = [ + 
get_instance_offer_with_availability(instance_type="bad-offer"), + get_instance_offer_with_availability(instance_type="good-offer"), + ] + backend_mock.compute.return_value.create_instance.return_value = ( + get_job_provisioning_data() + ) + + def create_placement_group_method( + placement_group: PlacementGroup, master_instance_offer: InstanceOffer + ) -> PlacementGroupProvisioningData: + if master_instance_offer.instance.name == "good-offer": + return get_placement_group_provisioning_data() + raise err + + backend_mock.compute.return_value.create_placement_group = create_placement_group_method + with patch("dstack._internal.server.services.backends.get_project_backends") as m: + m.return_value = [backend_mock] + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PROVISIONING + assert instance.offer + assert "good-offer" in instance.offer + assert "bad-offer" not in instance.offer + placement_groups = (await session.execute(select(PlacementGroupModel))).scalars().all() + assert len(placement_groups) == 1 diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_pipeline.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_pipeline.py new file mode 100644 index 0000000000..012c7fdb38 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_pipeline.py @@ -0,0 +1,253 @@ +import datetime as dt +import uuid +from unittest.mock import Mock + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.models.instances import InstanceStatus +from dstack._internal.server.background.pipeline_tasks.instances import ( + InstanceFetcher, + InstancePipeline, + InstanceWorker, +) +from dstack._internal.server.background.pipeline_tasks.instances import check as instances_check +from dstack._internal.server.schemas.instances import InstanceCheck +from 
dstack._internal.server.testing.common import ( + create_compute_group, + create_fleet, + create_instance, + create_project, +) +from dstack._internal.utils.common import get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + instance_to_pipeline_item, + lock_instance, + process_instance, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestInstanceFetcher: + async def test_fetch_selects_eligible_instances_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: InstanceFetcher + ): + project = await create_project(session=session) + fleet = await create_fleet(session=session, project=project) + compute_group = await create_compute_group(session=session, project=project, fleet=fleet) + now = get_current_datetime() + stale = now - dt.timedelta(minutes=1) + + pending = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + last_processed_at=stale - dt.timedelta(seconds=5), + ) + provisioning = await create_instance( + session=session, + project=project, + status=InstanceStatus.PROVISIONING, + name="provisioning", + last_processed_at=stale - dt.timedelta(seconds=4), + ) + busy = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + name="busy", + last_processed_at=stale - dt.timedelta(seconds=3), + ) + idle = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + name="idle", + last_processed_at=stale - dt.timedelta(seconds=2), + ) + terminating = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + name="terminating", + last_processed_at=stale - dt.timedelta(seconds=1), + ) + + deleted = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + name="deleted", + last_processed_at=stale, + ) + deleted.deleted = True + + recent = await 
create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + name="recent", + last_processed_at=now, + ) + + terminating_compute_group = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + name="terminating-compute-group", + last_processed_at=stale + dt.timedelta(seconds=1), + ) + terminating_compute_group.compute_group = compute_group + + locked = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + name="locked", + last_processed_at=stale + dt.timedelta(seconds=2), + ) + locked.lock_expires_at = now + dt.timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == { + pending.id, + provisioning.id, + busy.id, + idle.id, + terminating.id, + } + assert {item.status for item in items} == { + InstanceStatus.PENDING, + InstanceStatus.PROVISIONING, + InstanceStatus.BUSY, + InstanceStatus.IDLE, + InstanceStatus.TERMINATING, + } + + for instance in [ + pending, + provisioning, + busy, + idle, + terminating, + deleted, + recent, + terminating_compute_group, + locked, + ]: + await session.refresh(instance) + + expected_lock_owner = InstancePipeline.__name__ + fetched_instances = [pending, provisioning, busy, idle, terminating] + assert all(instance.lock_owner == expected_lock_owner for instance in fetched_instances) + assert all(instance.lock_expires_at is not None for instance in fetched_instances) + assert all(instance.lock_token is not None for instance in fetched_instances) + assert len({instance.lock_token for instance in fetched_instances}) == 1 + + assert deleted.lock_owner is None + assert recent.lock_owner is None + assert terminating_compute_group.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_respects_order_and_limit( + self, test_db, session: AsyncSession, fetcher: 
InstanceFetcher + ): + project = await create_project(session=session) + now = get_current_datetime() + + oldest = await create_instance( + session=session, + project=project, + name="oldest", + last_processed_at=now - dt.timedelta(minutes=3), + ) + middle = await create_instance( + session=session, + project=project, + name="middle", + last_processed_at=now - dt.timedelta(minutes=2), + ) + newest = await create_instance( + session=session, + project=project, + name="newest", + last_processed_at=now - dt.timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == InstancePipeline.__name__ + assert middle.lock_owner == InstancePipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestInstanceWorker: + async def test_process_skips_when_lock_token_changes( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.IDLE, + ) + + lock_instance(instance) + await session.commit() + item = instance_to_pipeline_item(instance) + new_lock_token = uuid.uuid4() + instance.lock_token = new_lock_token + await session.commit() + + await worker.process(item) + await session.refresh(instance) + + assert instance.lock_token == new_lock_token + assert instance.lock_owner == InstancePipeline.__name__ + + async def test_process_unlocks_and_updates_last_processed_at_after_check( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + 
status=InstanceStatus.PROVISIONING, + ) + before_processed_at = instance.last_processed_at + + monkeypatch.setattr( + instances_check, + "_check_instance_inner", + Mock(return_value=InstanceCheck(reachable=True)), + ) + await process_instance(session, worker, instance) + + await session.refresh(instance) + + assert instance.status == InstanceStatus.IDLE + assert instance.lock_expires_at is None + assert instance.lock_token is None + assert instance.lock_owner is None + assert instance.last_processed_at > before_processed_at diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_ssh_deploy.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_ssh_deploy.py new file mode 100644 index 0000000000..c103458ed4 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_ssh_deploy.py @@ -0,0 +1,248 @@ +import datetime as dt +from typing import Optional +from unittest.mock import Mock + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import SSHProvisioningError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.server.background.pipeline_tasks.instances import InstanceWorker +from dstack._internal.server.background.pipeline_tasks.instances import ( + ssh_deploy as instances_ssh_deploy, +) +from dstack._internal.server.testing.common import ( + create_instance, + create_project, + get_job_provisioning_data, + get_remote_connection_info, +) +from dstack._internal.utils.common import get_current_datetime +from tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + process_instance, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestSSHDeploy: + async def test_pending_ssh_instance_terminates_on_provision_timeout( + 
self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime() - dt.timedelta(days=100), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.PROVISIONING_TIMEOUT + + @pytest.mark.parametrize( + ["cpus", "gpus", "requested_blocks", "expected_blocks"], + [ + pytest.param(32, 8, 1, 1, id="gpu-instance-no-blocks"), + pytest.param(32, 8, 2, 2, id="gpu-instance-four-gpu-per-block"), + pytest.param(32, 8, 4, 4, id="gpu-instance-two-gpus-per-block"), + pytest.param(32, 8, None, 8, id="gpu-instance-auto-max-gpu"), + pytest.param(4, 8, None, 4, id="gpu-instance-auto-max-cpu"), + pytest.param(8, 8, None, 8, id="gpu-instance-auto-max-cpu-and-gpu"), + pytest.param(32, 0, 1, 1, id="cpu-instance-no-blocks"), + pytest.param(32, 0, 2, 2, id="cpu-instance-four-cpu-per-block"), + pytest.param(32, 0, 4, 4, id="cpu-instance-two-cpus-per-block"), + pytest.param(32, 0, None, 32, id="cpu-instance-auto-max-cpu"), + ], + ) + async def test_adds_ssh_instance( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + host_info: dict, + deploy_instance_mock: Mock, + cpus: int, + gpus: int, + requested_blocks: Optional[int], + expected_blocks: int, + ): + host_info["cpus"] = cpus + host_info["gpu_count"] = gpus + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + total_blocks=requested_blocks, + busy_blocks=0, + ) + await session.commit() + + await 
process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.IDLE + assert instance.total_blocks == expected_blocks + assert instance.busy_blocks == 0 + deploy_instance_mock.assert_called_once() + + async def test_retries_ssh_instance_if_provisioning_fails( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + deploy_instance_mock: Mock, + ): + deploy_instance_mock.side_effect = SSHProvisioningError("Expected") + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.PENDING + assert instance.termination_reason is None + + async def test_terminates_ssh_instance_if_deploy_fails_unexpectedly( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + deploy_instance_mock: Mock, + ): + deploy_instance_mock.side_effect = RuntimeError("Unexpected") + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert instance.termination_reason_message == "Unexpected error when adding SSH instance" + + async def test_terminates_ssh_instance_if_key_is_invalid( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + monkeypatch: pytest.MonkeyPatch, + ): + monkeypatch.setattr( + 
instances_ssh_deploy, + "ssh_keys_to_pkeys", + Mock(side_effect=ValueError("Bad key")), + ) + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert instance.termination_reason_message == "Unsupported private SSH key type" + + async def test_terminates_ssh_instance_if_internal_ip_cannot_be_resolved_from_network( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + host_info: dict, + deploy_instance_mock: Mock, + ): + host_info["addresses"] = ["192.168.100.100/24"] + project = await create_project(session=session) + job_provisioning_data = get_job_provisioning_data( + dockerized=True, + backend=BackendType.REMOTE, + internal_ip=None, + ) + job_provisioning_data.instance_network = "10.0.0.0/24" + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + job_provisioning_data=job_provisioning_data, + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert ( + instance.termination_reason_message + == "Failed to locate internal IP address on the given network" + ) + + async def test_terminates_ssh_instance_if_internal_ip_is_not_in_host_interfaces( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + host_info: dict, + deploy_instance_mock: Mock, + ): + 
host_info["addresses"] = ["192.168.100.100/24"] + project = await create_project(session=session) + job_provisioning_data = get_job_provisioning_data( + dockerized=True, + backend=BackendType.REMOTE, + internal_ip="10.0.0.20", + ) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + job_provisioning_data=job_provisioning_data, + ) + await session.commit() + + await process_instance(session, worker, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert ( + instance.termination_reason_message + == "Specified internal IP not found among instance interfaces" + ) diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_termination.py b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_termination.py new file mode 100644 index 0000000000..b9da58fc11 --- /dev/null +++ b/src/tests/_internal/server/background/pipeline_tasks/test_instances/test_termination.py @@ -0,0 +1,219 @@ +import datetime as dt +from contextlib import contextmanager +from typing import Optional +from unittest.mock import AsyncMock, Mock, patch + +import pytest +from freezegun import freeze_time +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.core.errors import BackendError, NotYetTerminated +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import InstanceStatus, InstanceTerminationReason +from dstack._internal.server.background.pipeline_tasks.instances import InstanceWorker +from dstack._internal.server.background.pipeline_tasks.instances import ( + termination as instances_termination, +) +from dstack._internal.server.testing.common import create_instance, create_project +from 
tests._internal.server.background.pipeline_tasks.test_instances.helpers import ( + instance_to_pipeline_item, + lock_instance, + process_instance, +) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestTermination: + @staticmethod + @contextmanager + def mock_terminate_in_backend(error: Optional[Exception] = None): + backend = Mock() + backend.TYPE = BackendType.VERDA + terminate_instance = backend.compute.return_value.terminate_instance + if error is not None: + terminate_instance.side_effect = error + with patch.object( + instances_termination.backends_services, + "get_project_backend_by_type", + AsyncMock(return_value=backend), + ): + yield terminate_instance + + async def test_terminate( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + instance.last_job_processed_at = dt.datetime.now(dt.timezone.utc) + dt.timedelta( + minutes=-19 + ) + await session.commit() + + with self.mock_terminate_in_backend() as mock: + await process_instance(session, worker, instance) + mock.assert_called_once() + + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.IDLE_TIMEOUT + assert instance.deleted is True + assert instance.deleted_at is not None + assert instance.finished_at is not None + + async def test_terminates_terminating_deleted_instance( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + lock_instance(instance) + await session.commit() + item = 
instance_to_pipeline_item(instance) + instance.deleted = True + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + instance.last_job_processed_at = instance.deleted_at = dt.datetime.now( + dt.timezone.utc + ) + dt.timedelta(minutes=-19) + await session.commit() + + with self.mock_terminate_in_backend() as mock: + await worker.process(item) + mock.assert_called_once() + + await session.refresh(instance) + + assert instance.status == InstanceStatus.TERMINATED + assert instance.deleted is True + assert instance.deleted_at is not None + assert instance.finished_at is not None + + @pytest.mark.parametrize( + "error", [BackendError("err"), RuntimeError("err"), NotYetTerminated("")] + ) + async def test_terminate_retry( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + error: Exception, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) + instance.last_job_processed_at = initial_time + instance.last_processed_at = initial_time - dt.timedelta(minutes=1) + await session.commit() + + with ( + freeze_time(initial_time + dt.timedelta(minutes=1)), + self.mock_terminate_in_backend(error=error) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATING + + with ( + freeze_time(initial_time + dt.timedelta(minutes=2)), + self.mock_terminate_in_backend(error=None) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + + async def test_terminate_not_retries_if_too_early( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): 
+ project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) + instance.last_job_processed_at = initial_time + instance.last_processed_at = initial_time - dt.timedelta(minutes=1) + await session.commit() + + with ( + freeze_time(initial_time + dt.timedelta(minutes=1)), + self.mock_terminate_in_backend(error=BackendError("err")) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATING + + instance.last_processed_at = initial_time + await session.commit() + + with ( + freeze_time(initial_time + dt.timedelta(minutes=1, seconds=11)), + self.mock_terminate_in_backend(error=None) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_not_called() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATING + + async def test_terminate_on_termination_deadline( + self, + test_db, + session: AsyncSession, + worker: InstanceWorker, + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.TERMINATING, + ) + instance.termination_reason = InstanceTerminationReason.IDLE_TIMEOUT + initial_time = dt.datetime(2025, 1, 1, tzinfo=dt.timezone.utc) + instance.last_job_processed_at = initial_time + instance.last_processed_at = initial_time - dt.timedelta(minutes=1) + await session.commit() + + with ( + freeze_time(initial_time + dt.timedelta(minutes=1)), + self.mock_terminate_in_backend(error=BackendError("err")) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == 
InstanceStatus.TERMINATING + + with ( + freeze_time(initial_time + dt.timedelta(minutes=15, seconds=55)), + self.mock_terminate_in_backend(error=None) as mock, + ): + await process_instance(session, worker, instance) + mock.assert_called_once() + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py index c23d5e604d..90c8e75194 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_placement_groups.py @@ -1,5 +1,6 @@ +import asyncio import uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from unittest.mock import Mock, patch import pytest @@ -7,7 +8,11 @@ from dstack._internal.core.errors import PlacementGroupInUseError from dstack._internal.server.background.pipeline_tasks.base import PipelineItem -from dstack._internal.server.background.pipeline_tasks.placement_groups import PlacementGroupWorker +from dstack._internal.server.background.pipeline_tasks.placement_groups import ( + PlacementGroupFetcher, + PlacementGroupPipeline, + PlacementGroupWorker, +) from dstack._internal.server.models import PlacementGroupModel from dstack._internal.server.testing.common import ( ComputeMockSpec, @@ -15,6 +20,7 @@ create_placement_group, create_project, ) +from dstack._internal.utils.common import get_current_datetime @pytest.fixture @@ -22,6 +28,17 @@ def worker() -> PlacementGroupWorker: return PlacementGroupWorker(queue=Mock(), heartbeater=Mock()) +@pytest.fixture +def fetcher() -> PlacementGroupFetcher: + return PlacementGroupFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + def 
_placement_group_to_pipeline_item(placement_group: PlacementGroupModel) -> PipelineItem: assert placement_group.lock_token is not None assert placement_group.lock_expires_at is not None @@ -34,9 +51,133 @@ def _placement_group_to_pipeline_item(placement_group: PlacementGroupModel) -> P ) +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestPlacementGroupFetcher: + async def test_fetch_selects_eligible_placement_groups_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: PlacementGroupFetcher + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + eligible = await create_placement_group( + session=session, + project=project, + fleet=fleet, + fleet_deleted=True, + ) + eligible.last_processed_at = stale - timedelta(seconds=2) + + fleet_not_deleted = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="fleet-not-deleted", + fleet_deleted=False, + ) + fleet_not_deleted.last_processed_at = stale - timedelta(seconds=1) + + deleted = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="deleted", + fleet_deleted=True, + deleted=True, + ) + deleted.last_processed_at = stale + + recent = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="recent", + fleet_deleted=True, + ) + recent.last_processed_at = now + + locked = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="locked", + fleet_deleted=True, + ) + locked.last_processed_at = stale + timedelta(seconds=1) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert [item.id for item in items] == [eligible.id] + + for placement_group 
in [eligible, fleet_not_deleted, deleted, recent, locked]: + await session.refresh(placement_group) + + assert eligible.lock_owner == PlacementGroupPipeline.__name__ + assert eligible.lock_expires_at is not None + assert eligible.lock_token is not None + + assert fleet_not_deleted.lock_owner is None + assert deleted.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_placement_groups_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: PlacementGroupFetcher + ): + project = await create_project(session) + fleet = await create_fleet(session=session, project=project) + now = get_current_datetime() + + oldest = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="oldest", + fleet_deleted=True, + ) + oldest.last_processed_at = now - timedelta(minutes=3) + + middle = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="middle", + fleet_deleted=True, + ) + middle.last_processed_at = now - timedelta(minutes=2) + + newest = await create_placement_group( + session=session, + project=project, + fleet=fleet, + name="newest", + fleet_deleted=True, + ) + newest.last_processed_at = now - timedelta(minutes=1) + await session.commit() + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == PlacementGroupPipeline.__name__ + assert middle.lock_owner == PlacementGroupPipeline.__name__ + assert newest.lock_owner is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) class TestPlacementGroupWorker: - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_deletes_placement_group( self, test_db, session: AsyncSession, 
worker: PlacementGroupWorker ): @@ -64,8 +205,6 @@ async def test_deletes_placement_group( await session.refresh(placement_group) assert placement_group.deleted - @pytest.mark.asyncio - @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_retries_placement_group_deletion_if_still_in_use( self, test_db, session: AsyncSession, worker: PlacementGroupWorker ): diff --git a/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py b/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py index 4d22c59b97..63dfaaa45a 100644 --- a/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py +++ b/src/tests/_internal/server/background/pipeline_tasks/test_volumes.py @@ -1,5 +1,6 @@ +import asyncio import uuid -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from unittest.mock import Mock, patch import pytest @@ -9,6 +10,8 @@ from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.volumes import VolumeProvisioningData, VolumeStatus from dstack._internal.server.background.pipeline_tasks.volumes import ( + VolumeFetcher, + VolumePipeline, VolumePipelineItem, VolumeWorker, ) @@ -22,6 +25,7 @@ get_volume_provisioning_data, list_events, ) +from dstack._internal.utils.common import get_current_datetime @pytest.fixture @@ -29,6 +33,17 @@ def worker() -> VolumeWorker: return VolumeWorker(queue=Mock(), heartbeater=Mock()) +@pytest.fixture +def fetcher() -> VolumeFetcher: + return VolumeFetcher( + queue=asyncio.Queue(), + queue_desired_minsize=1, + min_processing_interval=timedelta(seconds=15), + lock_timeout=timedelta(seconds=30), + heartbeater=Mock(), + ) + + def _volume_to_pipeline_item(volume_model: VolumeModel) -> VolumePipelineItem: assert volume_model.lock_token is not None assert volume_model.lock_expires_at is not None @@ -43,6 +58,145 @@ def _volume_to_pipeline_item(volume_model: VolumeModel) -> VolumePipelineItem: ) 
+@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestVolumeFetcher: + async def test_fetch_selects_eligible_volumes_and_sets_lock_fields( + self, test_db, session: AsyncSession, fetcher: VolumeFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + now = get_current_datetime() + stale = now - timedelta(minutes=1) + + submitted = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=stale - timedelta(minutes=1), + last_processed_at=stale - timedelta(seconds=2), + ) + to_be_deleted = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.ACTIVE, + created_at=stale - timedelta(minutes=1), + last_processed_at=stale - timedelta(seconds=1), + ) + to_be_deleted.to_be_deleted = True + + just_created = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now, + last_processed_at=now, + ) + + deleted = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=stale - timedelta(minutes=1), + last_processed_at=stale, + deleted_at=stale, + ) + recent = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now - timedelta(minutes=2), + last_processed_at=now, + ) + locked = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=stale - timedelta(minutes=1), + last_processed_at=stale + timedelta(seconds=1), + ) + locked.lock_expires_at = now + timedelta(minutes=1) + locked.lock_token = uuid.uuid4() + locked.lock_owner = "OtherPipeline" + await session.commit() + + items = await fetcher.fetch(limit=10) + + assert {item.id for item in items} == { + submitted.id, + to_be_deleted.id, + just_created.id, + } + assert {(item.id, 
item.status, item.to_be_deleted) for item in items} == { + (submitted.id, VolumeStatus.SUBMITTED, False), + (to_be_deleted.id, VolumeStatus.ACTIVE, True), + (just_created.id, VolumeStatus.SUBMITTED, False), + } + + for volume in [submitted, to_be_deleted, just_created, deleted, recent, locked]: + await session.refresh(volume) + + fetched_volumes = [submitted, to_be_deleted, just_created] + assert all(volume.lock_owner == VolumePipeline.__name__ for volume in fetched_volumes) + assert all(volume.lock_expires_at is not None for volume in fetched_volumes) + assert all(volume.lock_token is not None for volume in fetched_volumes) + assert len({volume.lock_token for volume in fetched_volumes}) == 1 + + assert deleted.lock_owner is None + assert recent.lock_owner is None + assert locked.lock_owner == "OtherPipeline" + + async def test_fetch_returns_oldest_volumes_first_up_to_limit( + self, test_db, session: AsyncSession, fetcher: VolumeFetcher + ): + project = await create_project(session=session) + user = await create_user(session=session) + now = get_current_datetime() + + oldest = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now - timedelta(minutes=4), + last_processed_at=now - timedelta(minutes=3), + ) + middle = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now - timedelta(minutes=3), + last_processed_at=now - timedelta(minutes=2), + ) + newest = await create_volume( + session=session, + project=project, + user=user, + status=VolumeStatus.SUBMITTED, + created_at=now - timedelta(minutes=2), + last_processed_at=now - timedelta(minutes=1), + ) + + items = await fetcher.fetch(limit=2) + + assert [item.id for item in items] == [oldest.id, middle.id] + + await session.refresh(oldest) + await session.refresh(middle) + await session.refresh(newest) + + assert oldest.lock_owner == VolumePipeline.__name__ + assert middle.lock_owner == 
VolumePipeline.__name__ + assert newest.lock_owner is None + + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) class TestVolumeWorkerSubmitted: diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_instance_healthchecks.py b/src/tests/_internal/server/background/scheduled_tasks/test_instance_healthchecks.py new file mode 100644 index 0000000000..06ea5ab5ac --- /dev/null +++ b/src/tests/_internal/server/background/scheduled_tasks/test_instance_healthchecks.py @@ -0,0 +1,49 @@ +from datetime import timedelta + +import pytest +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from dstack._internal.server.background.scheduled_tasks.instance_healthchecks import ( + delete_instance_healthchecks, +) +from dstack._internal.server.models import InstanceHealthCheckModel, InstanceStatus +from dstack._internal.server.testing.common import ( + create_instance, + create_instance_health_check, + create_project, +) +from dstack._internal.utils.common import get_current_datetime + + +@pytest.mark.asyncio +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +@pytest.mark.usefixtures("test_db", "image_config_mock") +class TestDeleteInstanceHealthChecks: + async def test_deletes_instance_health_checks( + self, monkeypatch: pytest.MonkeyPatch, session: AsyncSession + ): + project = await create_project(session=session) + instance = await create_instance( + session=session, project=project, status=InstanceStatus.IDLE + ) + # 30 minutes + monkeypatch.setattr( + "dstack._internal.server.settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS", 1800 + ) + now = get_current_datetime() + # old check + await create_instance_health_check( + session=session, instance=instance, collected_at=now - timedelta(minutes=40) + ) + # recent check + check = await create_instance_health_check( + session=session, instance=instance, collected_at=now - timedelta(minutes=20) + ) + + await 
delete_instance_healthchecks() + + res = await session.execute(select(InstanceHealthCheckModel)) + all_checks = res.scalars().all() + assert len(all_checks) == 1 + assert all_checks[0] == check diff --git a/src/tests/_internal/server/background/scheduled_tasks/test_instances.py b/src/tests/_internal/server/background/scheduled_tasks/test_instances.py index 1b9789953e..88e4acc949 100644 --- a/src/tests/_internal/server/background/scheduled_tasks/test_instances.py +++ b/src/tests/_internal/server/background/scheduled_tasks/test_instances.py @@ -19,6 +19,7 @@ NoCapacityError, NotYetTerminated, ProvisioningError, + SSHProvisioningError, ) from dstack._internal.core.models.backends.base import BackendType from dstack._internal.core.models.fleets import InstanceGroupPlacement @@ -40,7 +41,6 @@ JobStatus, ) from dstack._internal.server.background.scheduled_tasks.instances import ( - delete_instance_health_checks, process_instances, ) from dstack._internal.server.models import ( @@ -65,7 +65,6 @@ ComputeMockSpec, create_fleet, create_instance, - create_instance_health_check, create_job, create_project, create_repo, @@ -1206,38 +1205,141 @@ async def test_adds_ssh_instance( assert instance.total_blocks == expected_blocks assert instance.busy_blocks == 0 + async def test_retries_ssh_instance_if_provisioning_fails( + self, + session: AsyncSession, + deploy_instance_mock: Mock, + ): + deploy_instance_mock.side_effect = SSHProvisioningError("Expected") + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() -@pytest.mark.asyncio -@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) -@pytest.mark.usefixtures("test_db", "image_config_mock") -class TestDeleteInstanceHealthChecks: - async def test_deletes_instance_health_checks( - self, 
monkeypatch: pytest.MonkeyPatch, session: AsyncSession + await process_instances() + + await session.refresh(instance) + assert instance.status == InstanceStatus.PENDING + assert instance.termination_reason is None + + async def test_terminates_ssh_instance_if_deploy_fails_unexpectedly( + self, + session: AsyncSession, + deploy_instance_mock: Mock, ): + deploy_instance_mock.side_effect = RuntimeError("Unexpected") project = await create_project(session=session) instance = await create_instance( - session=session, project=project, status=InstanceStatus.IDLE + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), ) - # 30 minutes + await session.commit() + + await process_instances() + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert instance.termination_reason_message == "Unexpected error when adding SSH instance" + + async def test_terminates_ssh_instance_if_key_is_invalid( + self, + session: AsyncSession, + monkeypatch: pytest.MonkeyPatch, + ): monkeypatch.setattr( - "dstack._internal.server.settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS", 1800 + "dstack._internal.server.background.scheduled_tasks.instances._ssh_keys_to_pkeys", + Mock(side_effect=ValueError("Bad key")), + ) + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + ) + await session.commit() + + await process_instances() + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert instance.termination_reason_message == "Unsupported private SSH key type" + + async def 
test_terminates_ssh_instance_if_internal_ip_cannot_be_resolved_from_network( + self, + session: AsyncSession, + host_info: dict, + ): + host_info["addresses"] = ["192.168.100.100/24"] + project = await create_project(session=session) + job_provisioning_data = get_job_provisioning_data( + dockerized=True, + backend=BackendType.REMOTE, + internal_ip=None, + ) + job_provisioning_data.instance_network = "10.0.0.0/24" + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + job_provisioning_data=job_provisioning_data, + ) + await session.commit() + + await process_instances() + + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert ( + instance.termination_reason_message + == "Failed to locate internal IP address on the given network" ) - now = get_current_datetime() - # old check - await create_instance_health_check( - session=session, instance=instance, collected_at=now - dt.timedelta(minutes=40) + + async def test_terminates_ssh_instance_if_internal_ip_is_not_in_host_interfaces( + self, + session: AsyncSession, + host_info: dict, + ): + host_info["addresses"] = ["192.168.100.100/24"] + project = await create_project(session=session) + job_provisioning_data = get_job_provisioning_data( + dockerized=True, + backend=BackendType.REMOTE, + internal_ip="10.0.0.20", ) - # recent check - check = await create_instance_health_check( - session=session, instance=instance, collected_at=now - dt.timedelta(minutes=20) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.PENDING, + created_at=get_current_datetime(), + remote_connection_info=get_remote_connection_info(), + job_provisioning_data=job_provisioning_data, ) + await session.commit() - await delete_instance_health_checks() + 
await process_instances() - res = await session.execute(select(InstanceHealthCheckModel)) - all_checks = res.scalars().all() - assert len(all_checks) == 1 - assert all_checks[0] == check + await session.refresh(instance) + assert instance.status == InstanceStatus.TERMINATED + assert instance.termination_reason == InstanceTerminationReason.ERROR + assert ( + instance.termination_reason_message + == "Specified internal IP not found among instance interfaces" + ) @pytest.mark.asyncio diff --git a/src/tests/_internal/server/routers/test_fleets.py b/src/tests/_internal/server/routers/test_fleets.py index 12108eed31..d14e74e80d 100644 --- a/src/tests/_internal/server/routers/test_fleets.py +++ b/src/tests/_internal/server/routers/test_fleets.py @@ -1659,6 +1659,84 @@ async def test_terminates_fleet_instances( assert instance2.status != InstanceStatus.TERMINATING assert fleet.status != FleetStatus.TERMINATING + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_ignores_lock_on_non_selected_instances( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance1 = await create_instance( + session=session, + project=project, + instance_num=1, + ) + instance2 = await create_instance( + session=session, + project=project, + instance_num=2, + ) + fleet.instances.append(instance1) + fleet.instances.append(instance2) + instance2.lock_expires_at = datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc) + await session.commit() + + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert 
response.status_code == 200 + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + assert instance1.status == InstanceStatus.TERMINATING + assert instance2.status != InstanceStatus.TERMINATING + assert fleet.status != FleetStatus.TERMINATING + + @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) + async def test_returns_400_when_selected_instance_locked( + self, test_db, session: AsyncSession, client: AsyncClient + ): + user = await create_user(session, global_role=GlobalRole.USER) + project = await create_project(session) + await add_project_member( + session=session, project=project, user=user, project_role=ProjectRole.USER + ) + fleet = await create_fleet(session=session, project=project) + instance1 = await create_instance( + session=session, + project=project, + instance_num=1, + ) + instance2 = await create_instance( + session=session, + project=project, + instance_num=2, + ) + fleet.instances.append(instance1) + fleet.instances.append(instance2) + instance1.lock_expires_at = datetime(2023, 1, 2, 3, 5, tzinfo=timezone.utc) + await session.commit() + + response = await client.post( + f"/api/project/{project.name}/fleets/delete_instances", + headers=get_auth_headers(user.token), + json={"name": fleet.name, "instance_nums": [1]}, + ) + assert response.status_code == 400 + await session.refresh(fleet) + await session.refresh(instance1) + await session.refresh(instance2) + assert instance1.status != InstanceStatus.TERMINATING + assert instance2.status != InstanceStatus.TERMINATING + assert fleet.status != FleetStatus.TERMINATING + @pytest.mark.asyncio @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) async def test_returns_400_when_deleting_busy_instances( diff --git a/src/tests/_internal/server/services/test_instances.py b/src/tests/_internal/server/services/test_instances.py index 4883e309cc..0eef682003 100644 --- 
a/src/tests/_internal/server/services/test_instances.py +++ b/src/tests/_internal/server/services/test_instances.py @@ -1,4 +1,5 @@ import uuid +from unittest.mock import Mock, call import pytest from sqlalchemy.ext.asyncio import AsyncSession @@ -14,10 +15,16 @@ Resources, ) from dstack._internal.core.models.profiles import Profile +from dstack._internal.core.models.runs import JobStatus from dstack._internal.server.models import InstanceModel +from dstack._internal.server.schemas.runner import TaskListItem, TaskListResponse, TaskStatus +from dstack._internal.server.services.runner.client import ShimClient from dstack._internal.server.testing.common import ( create_instance, + create_job, create_project, + create_repo, + create_run, create_user, get_volume, get_volume_configuration, @@ -155,6 +162,117 @@ async def test_returns_volume_instances(self, test_db, session: AsyncSession): assert res == [runpod_instance2] +@pytest.mark.asyncio +@pytest.mark.usefixtures("image_config_mock") +@pytest.mark.usefixtures("turn_off_keep_shim_tasks_setting") +@pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) +class TestRemoveDanglingTasks: + @pytest.fixture + def turn_off_keep_shim_tasks_setting(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("dstack._internal.server.settings.SERVER_KEEP_SHIM_TASKS", False) + + async def test_terminates_and_removes_dangling_tasks( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + instance=instance, + ) + dangling_task_id_1 = 
"fe138b77-d0b1-49d3-8c9f-2dfe78ece727" + dangling_task_id_2 = "8b016a75-41de-44f1-91ff-c9b63d2caa1d" + shim_client_mock = Mock(spec_set=ShimClient) + shim_client_mock.is_api_v2_supported.return_value = True + shim_client_mock.list_tasks.return_value = TaskListResponse( + tasks=[ + TaskListItem(id=str(job.id), status=TaskStatus.RUNNING), + TaskListItem(id=dangling_task_id_1, status=TaskStatus.RUNNING), + TaskListItem(id=dangling_task_id_2, status=TaskStatus.TERMINATED), + ] + ) + await session.refresh(instance, attribute_names=["jobs"]) + + instances_services.remove_dangling_tasks_from_instance(shim_client_mock, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.BUSY + + shim_client_mock.terminate_task.assert_called_once_with( + task_id=dangling_task_id_1, + reason=None, + message=None, + timeout=0, + ) + assert shim_client_mock.remove_task.call_count == 2 + shim_client_mock.remove_task.assert_has_calls( + [call(task_id=dangling_task_id_1), call(task_id=dangling_task_id_2)] + ) + + async def test_terminates_and_removes_dangling_tasks_legacy_shim( + self, test_db, session: AsyncSession + ) -> None: + user = await create_user(session=session) + project = await create_project(session=session) + instance = await create_instance( + session=session, + project=project, + status=InstanceStatus.BUSY, + ) + repo = await create_repo(session=session, project_id=project.id) + run = await create_run( + session=session, + project=project, + repo=repo, + user=user, + ) + job = await create_job( + session=session, + run=run, + status=JobStatus.RUNNING, + instance=instance, + ) + dangling_task_id_1 = "fe138b77-d0b1-49d3-8c9f-2dfe78ece727" + dangling_task_id_2 = "8b016a75-41de-44f1-91ff-c9b63d2caa1d" + shim_client_mock = Mock(spec_set=ShimClient) + shim_client_mock.is_api_v2_supported.return_value = True + shim_client_mock.list_tasks.return_value = TaskListResponse( + ids=[str(job.id), dangling_task_id_1, dangling_task_id_2] + ) + await 
session.refresh(instance, attribute_names=["jobs"]) + + instances_services.remove_dangling_tasks_from_instance(shim_client_mock, instance) + + await session.refresh(instance) + assert instance.status == InstanceStatus.BUSY + + assert shim_client_mock.terminate_task.call_count == 2 + shim_client_mock.terminate_task.assert_has_calls( + [ + call(task_id=dangling_task_id_1, reason=None, message=None, timeout=0), + call(task_id=dangling_task_id_2, reason=None, message=None, timeout=0), + ] + ) + assert shim_client_mock.remove_task.call_count == 2 + shim_client_mock.remove_task.assert_has_calls( + [call(task_id=dangling_task_id_1), call(task_id=dangling_task_id_2)] + ) + + class TestInstanceModelToInstance: @pytest.mark.asyncio + @pytest.mark.parametrize("test_db", ["sqlite", "postgres"], indirect=True) From 9ceafaba0430c63568164d15734b3fdba6113143 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 6 Mar 2026 15:58:09 +0500 Subject: [PATCH 187/187] Migrate attribute comments to docstrings (#3639) * Migrate attribute comments to docstrings * Minor fixes * Update AGENTS.md * Remove leading newlines --- AGENTS.md | 1 + .../_internal/core/models/backends/base.py | 5 +- src/dstack/_internal/core/models/common.py | 6 +- .../_internal/core/models/compute_groups.py | 8 +- src/dstack/_internal/core/models/config.py | 5 +- .../_internal/core/models/configurations.py | 11 +- src/dstack/_internal/core/models/fleets.py | 14 +- src/dstack/_internal/core/models/gateways.py | 28 ++- src/dstack/_internal/core/models/instances.py | 28 ++- src/dstack/_internal/core/models/placement.py | 3 +- .../_internal/core/models/repos/remote.py | 3 +- src/dstack/_internal/core/models/resources.py | 4 +- src/dstack/_internal/core/models/runs.py | 129 ++++++++----- src/dstack/_internal/core/models/volumes.py | 20 +- src/dstack/_internal/server/models.py | 174 ++++++++++-------- .../_internal/server/schemas/health/dcgm.py | 11 +- src/dstack/_internal/server/schemas/runner.py | 23 ++- 17 files 
changed, 288 insertions(+), 185 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 336b97bb5b..bb1a7aac0f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -20,6 +20,7 @@ - Keep primary/public functions before local helper functions in a module section. - Keep private classes, exceptions, and similar implementation-specific types close to the private functions that use them unless they are shared more broadly in the module. - Prefer pydantic-style models in `core/models`. +- Document attributes when the note adds behavior, compatibility, or semantic context that is not obvious from the name and type. Use attribute docstrings without leading newline. - Tests use `test_*.py` modules and `test_*` functions; fixtures live near usage. ## Testing Guidelines diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index ba382a0b66..2e8eb898ee 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -32,7 +32,8 @@ class BackendType(str, enum.Enum): CLOUDRIFT = "cloudrift" CRUSOE = "crusoe" CUDO = "cudo" - DATACRUNCH = "datacrunch" # BackendType for backward compatibility + DATACRUNCH = "datacrunch" + """`DATACRUNCH` is kept as a `BackendType` for backward compatibility.""" DIGITALOCEAN = "digitalocean" DSTACK = "dstack" GCP = "gcp" @@ -40,7 +41,7 @@ class BackendType(str, enum.Enum): KUBERNETES = "kubernetes" LAMBDA = "lambda" LOCAL = "local" - REMOTE = "remote" # TODO: replace for LOCAL + REMOTE = "remote" NEBIUS = "nebius" OCI = "oci" RUNPOD = "runpod" diff --git a/src/dstack/_internal/core/models/common.py b/src/dstack/_internal/core/models/common.py index 6fcc6d0392..f55a032ba5 100644 --- a/src/dstack/_internal/core/models/common.py +++ b/src/dstack/_internal/core/models/common.py @@ -134,8 +134,10 @@ class RegistryAuth(FrozenCoreModel): class ApplyAction(str, Enum): - CREATE = "create" # resource is to be created or overridden - UPDATE = "update" # 
resource is to be updated in-place + CREATE = "create" + """`CREATE` means the resource is to be created or overridden.""" + UPDATE = "update" + """`UPDATE` means the resource is to be updated in-place.""" class NetworkMode(str, Enum): diff --git a/src/dstack/_internal/core/models/compute_groups.py b/src/dstack/_internal/core/models/compute_groups.py index 3fa967494d..55dc0d2385 100644 --- a/src/dstack/_internal/core/models/compute_groups.py +++ b/src/dstack/_internal/core/models/compute_groups.py @@ -24,12 +24,14 @@ class ComputeGroupProvisioningData(CoreModel): compute_group_id: str compute_group_name: str backend: BackendType - # In case backend provisions instance in another backend, - # it may set that backend as base_backend. base_backend: Optional[BackendType] = None + """`base_backend` may be set when a backend provisions an instance in another backend and needs + to record that backend as `base_backend`. + """ region: str job_provisioning_datas: List[JobProvisioningData] - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" class ComputeGroup(CoreModel): diff --git a/src/dstack/_internal/core/models/config.py b/src/dstack/_internal/core/models/config.py index c6d0916672..a0497401d9 100644 --- a/src/dstack/_internal/core/models/config.py +++ b/src/dstack/_internal/core/models/config.py @@ -23,6 +23,7 @@ class RepoConfig(CoreModel): class GlobalConfig(CoreModel): projects: Annotated[List[ProjectConfig], Field(description="The list of projects")] = [] - # Not used since 0.20.0. Can be removed when most users update their `config.yml` (it's updated - # each time a project is added) repos: Annotated[list[RepoConfig], Field(exclude=True)] = [] + """`repos` is not used since 0.20.0. It can be removed when most users update their `config.yml` + because it is updated each time a project is added. 
+ """ diff --git a/src/dstack/_internal/core/models/configurations.py b/src/dstack/_internal/core/models/configurations.py index 8c86fd5bd9..ac8d8d172b 100644 --- a/src/dstack/_internal/core/models/configurations.py +++ b/src/dstack/_internal/core/models/configurations.py @@ -101,10 +101,10 @@ def parse(cls, v: str) -> "PortMapping": class RepoExistsAction(str, Enum): - # Don't try to check out, terminate the run with an error (the default action since 0.20.0) ERROR = "error" - # Don't try to check out, skip the repo (the logic hardcoded in the pre-0.20.0 runner) + """`ERROR` means do not try to check out and terminate the run with an error. This is the default action since 0.20.0.""" SKIP = "skip" + """`SKIP` means do not try to check out and skip the repo. This is the logic hardcoded in the pre-0.20.0 runner.""" class RepoSpec(CoreModel): @@ -469,8 +469,8 @@ class BaseRunConfiguration(CoreModel): ), ), ] = None - # deprecated since 0.18.31; has no effect home_dir: str = "/root" + """`home_dir` is deprecated since 0.18.31 and has no effect.""" registry_auth: Annotated[ Optional[RegistryAuth], Field(description="Credentials for pulling a private Docker image") ] = None @@ -540,8 +540,11 @@ class BaseRunConfiguration(CoreModel): list[FilePathMapping], Field(description="The local to container file path mappings"), ] = [] - # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init` setup: CommandsList = [] + """ + setup: Deprecated since 0.18.31. It has no effect for tasks and services; for + dev environments it runs right before `init`. 
+ """ @validator("python", pre=True, always=True) def convert_python(cls, v, values) -> Optional[PythonVersion]: diff --git a/src/dstack/_internal/core/models/fleets.py b/src/dstack/_internal/core/models/fleets.py index c88f606640..a56296bbad 100644 --- a/src/dstack/_internal/core/models/fleets.py +++ b/src/dstack/_internal/core/models/fleets.py @@ -30,8 +30,8 @@ class FleetStatus(str, Enum): - # Currently all fleets are ACTIVE/TERMINATING/TERMINATED - # SUBMITTED/FAILED may be used if fleets require async processing + # Currently all fleets are ACTIVE, TERMINATING, or TERMINATED. + # SUBMITTED and FAILED may be used if fleets require async processing. SUBMITTED = "submitted" ACTIVE = "active" TERMINATING = "terminating" @@ -372,10 +372,11 @@ class FleetSpec(generate_dual_core_model(FleetSpecConfig)): configuration_path: Optional[str] = None profile: Profile autocreated: bool = False - # merged_profile stores profile parameters merged from profile and configuration. - # Read profile parameters from merged_profile instead of profile directly. - # TODO: make merged_profile a computed field after migrating to pydanticV2 + # TODO: make `merged_profile` a computed field after migrating to Pydantic v2. merged_profile: Annotated[Profile, Field(exclude=True)] = None + """`merged_profile` stores profile parameters merged from `profile` and `configuration`. + Read profile parameters from `merged_profile` instead of `profile` directly. 
+ """ @root_validator def _merged_profile(cls, values) -> Dict: @@ -416,7 +417,8 @@ class FleetPlan(CoreModel): offers: List[InstanceOfferWithAvailability] total_offers: int max_offer_price: Optional[float] = None - action: Optional[ApplyAction] = None # default value for backward compatibility + action: Optional[ApplyAction] = None + """`action` uses a default value for backward compatibility.""" def get_effective_spec(self) -> FleetSpec: if self.effective_spec is not None: diff --git a/src/dstack/_internal/core/models/gateways.py b/src/dstack/_internal/core/models/gateways.py index 7f09d3df18..b3fbadb844 100644 --- a/src/dstack/_internal/core/models/gateways.py +++ b/src/dstack/_internal/core/models/gateways.py @@ -102,27 +102,33 @@ class GatewaySpec(CoreModel): class Gateway(CoreModel): - # ID is only optional on the client side for compatibility with pre-0.20.7 servers. - # TODO(0.21): Make required. + # TODO(0.21): Make `id` required. id: Optional[uuid.UUID] = None + """`id` is only optional on the client side for compatibility with pre-0.20.7 servers.""" name: str configuration: GatewayConfiguration created_at: datetime.datetime status: GatewayStatus status_message: Optional[str] - # The ip address / hostname the user should set up the domain for. - # Could be the same as ip_address but also different, e.g. gateway behind ALB. hostname: Optional[str] - # The ip address of the gateway instance + """`hostname` is the IP address or hostname the user should set up the domain for. + Could be the same as `ip_address` but also different, for example a gateway behind ALB. + """ ip_address: Optional[str] + """`ip_address` is the IP address of the gateway instance.""" instance_id: Optional[str] wildcard_domain: Optional[str] default: bool - # TODO: Deprecated configuration fields duplicated on top-level - # for backward compatibility with 0.19.x clients that expect them required. 
- # Remove after 0.21 backend: Optional[BackendType] = None + """`backend` duplicates a configuration field on the top level for backward compatibility + with 0.19.x clients that expect it to be required. + Remove after 0.21. + """ region: Optional[str] = None + """`region` duplicates a configuration field on the top level for backward compatibility + with 0.19.x clients that expect it to be required. + Remove after 0.21. + """ class GatewayPlan(CoreModel): @@ -147,8 +153,10 @@ class GatewayComputeConfiguration(CoreModel): class GatewayProvisioningData(CoreModel): instance_id: str - ip_address: str # TODO: rename, Kubernetes uses domain names + # TODO: rename `ip_address`; Kubernetes uses domain names here. + ip_address: str region: str availability_zone: Optional[str] = None hostname: Optional[str] = None - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" diff --git a/src/dstack/_internal/core/models/instances.py b/src/dstack/_internal/core/models/instances.py index 7eccee8b69..11a1aca518 100644 --- a/src/dstack/_internal/core/models/instances.py +++ b/src/dstack/_internal/core/models/instances.py @@ -23,9 +23,10 @@ class Gpu(CoreModel): name: str memory_mib: int - # Although it's declared as Optional, in fact it always has a value set by the root validator, - # that is, `assert gpu.vendor is not None` should be a safe type narrowing. vendor: Optional[gpuhunt.AcceleratorVendor] = None + """`vendor` is declared as optional, but the root validator always sets a value. + `assert gpu.vendor is not None` should be a safe type narrowing. 
+ """ @root_validator(pre=True) def validate_name_and_vendor(cls, values): @@ -54,13 +55,15 @@ class Resources(CoreModel): memory_mib: int gpus: List[Gpu] spot: bool - disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility + disk: Disk = Disk(size_mib=102400) + """`disk` defaults to 100GB for backward compatibility.""" cpu_arch: Optional[gpuhunt.CPUArchitecture] = None - # Deprecated: description is now generated client-side. TODO: remove in 0.21. + # TODO: remove `description` in 0.21. description: Annotated[ str, Field(description="Deprecated: generated client-side. Will be removed in 0.21."), ] = "" + """`description` is deprecated because it is now generated client-side.""" @root_validator def _description(cls, values) -> Dict: @@ -187,7 +190,8 @@ class RemoteConnectionInfo(CoreModel): class InstanceConfiguration(CoreModel): project_name: str instance_name: str - user: str # dstack user name + user: str + """`user` stores the dstack user name.""" ssh_keys: List[SSHKey] instance_id: Optional[str] = None reservation: Optional[str] = None @@ -208,7 +212,8 @@ class InstanceAvailability(Enum): AVAILABLE = "available" NOT_AVAILABLE = "not_available" NO_QUOTA = "no_quota" - NO_BALANCE = "no_balance" # For dstack Sky + NO_BALANCE = "no_balance" + """`NO_BALANCE` is used for dstack Sky.""" IDLE = "idle" BUSY = "busy" @@ -268,7 +273,8 @@ class InstanceTerminationReason(str, Enum): NO_OFFERS = "no_offers" MASTER_FAILED = "master_failed" MAX_INSTANCES_LIMIT = "max_instances_limit" - NO_BALANCE = "no_balance" # used in dstack Sky + NO_BALANCE = "no_balance" + """`NO_BALANCE` is used in dstack Sky.""" @classmethod def from_legacy_str(cls, v: str) -> "InstanceTerminationReason": @@ -332,14 +338,16 @@ class Instance(CoreModel): fleet_id: Optional[UUID] = None fleet_name: Optional[str] = None instance_num: int - job_name: Optional[str] = None # deprecated, always None (instance can have more than one job) + job_name: Optional[str] = None + 
"""`job_name` is deprecated and always `None` because an instance can have more than one job.""" hostname: Optional[str] = None status: InstanceStatus unreachable: bool = False health_status: HealthStatus = HealthStatus.HEALTHY - # termination_reason stores InstanceTerminationReason. - # str allows adding new enum members without breaking compatibility with old clients. termination_reason: Optional[str] = None + """`termination_reason` stores `InstanceTerminationReason`. + `str` allows adding new enum members without breaking compatibility with old clients. + """ termination_reason_message: Optional[str] = None created: datetime.datetime finished_at: Optional[datetime.datetime] = None diff --git a/src/dstack/_internal/core/models/placement.py b/src/dstack/_internal/core/models/placement.py index 93b0cf09d0..a0ce418bca 100644 --- a/src/dstack/_internal/core/models/placement.py +++ b/src/dstack/_internal/core/models/placement.py @@ -16,7 +16,8 @@ class PlacementGroupConfiguration(CoreModel): class PlacementGroupProvisioningData(CoreModel): - backend: BackendType # can be different from configuration backend + backend: BackendType + """`backend` can be different from the backend in `configuration`.""" backend_data: Optional[str] = None diff --git a/src/dstack/_internal/core/models/repos/remote.py b/src/dstack/_internal/core/models/repos/remote.py index d3c3b70906..3bfd34024d 100644 --- a/src/dstack/_internal/core/models/repos/remote.py +++ b/src/dstack/_internal/core/models/repos/remote.py @@ -236,7 +236,8 @@ class GitRepoURL: ssh_port: Optional[str] path: str - original_host: str # before SSH config lookup + original_host: str + """`original_host` stores the host value before SSH config lookup.""" @staticmethod def parse( diff --git a/src/dstack/_internal/core/models/resources.py b/src/dstack/_internal/core/models/resources.py index 02cbbdc9b8..81230afcf3 100644 --- a/src/dstack/_internal/core/models/resources.py +++ b/src/dstack/_internal/core/models/resources.py @@ 
-375,7 +375,7 @@ def schema_extra(schema: Dict[str, Any]): class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)): - # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only. + # TODO: remove `Range[int]` in 0.20. It is kept only for backward compatibility. cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = ( CPUSpec() ) @@ -390,8 +390,8 @@ class ResourcesSpec(generate_dual_core_model(ResourcesSpecConfig)): "you may need to configure this" ), ] = None - # Optional for backward compatibility gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = DEFAULT_GPU_SPEC + """`gpu` is optional for backward compatibility.""" disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK def pretty_format(self) -> str: diff --git a/src/dstack/_internal/core/models/runs.py b/src/dstack/_internal/core/models/runs.py index 558b07e26e..bfef8bc786 100644 --- a/src/dstack/_internal/core/models/runs.py +++ b/src/dstack/_internal/core/models/runs.py @@ -210,9 +210,9 @@ class Requirements(CoreModel): max_price: Optional[float] = None spot: Optional[bool] = None reservation: Optional[str] = None - # Backends can use `multinode` to filter out offers if - # some offers support multinode and some do not. multinode: Optional[bool] = None + """Backends can use `multinode` to filter out offers when some offers support multinode and some do not. 
+ """ def pretty_format(self, resources_only: bool = False): res = self.resources.pretty_format() @@ -241,7 +241,8 @@ class JobSSHKey(CoreModel): class ProbeSpec(CoreModel): - type: Literal["http"] # expect other probe types in the future, namely `exec` + type: Literal["http"] + """`type` currently expects `http`, but other probe types such as `exec` may be added later.""" url: str method: HTTPMethod = DEFAULT_PROBE_METHOD headers: list[HTTPHeaderSpec] = [] @@ -253,13 +254,16 @@ class ProbeSpec(CoreModel): class JobSpec(CoreModel): - replica_num: int = 0 # default value for backward compatibility + replica_num: int = 0 + """`replica_num` uses a default value for backward compatibility.""" job_num: int job_name: str - jobs_per_replica: int = 1 # default value for backward compatibility + jobs_per_replica: int = 1 + """`jobs_per_replica` uses a default value for backward compatibility.""" replica_group: str = DEFAULT_REPLICA_GROUP_NAME app_specs: Optional[List[AppSpec]] - user: Optional[UnixUser] = None # default value for backward compatibility + user: Optional[UnixUser] = None + """`user` uses a default value for backward compatibility.""" commands: List[str] env: Dict[str, str] home_dir: Optional[str] @@ -275,51 +279,61 @@ class JobSpec(CoreModel): volumes: Optional[List[MountPoint]] = None ssh_key: Optional[JobSSHKey] = None working_dir: Optional[str] - # `repo_data` is optional for client compatibility with pre-0.19.17 servers and for compatibility - # with jobs submitted before 0.19.17. All new jobs are expected to have non-None `repo_data`. - # For --no-repo runs, `repo_data` is `VirtualRunRepoData()`. repo_data: Annotated[Optional[AnyRunRepoData], Field(discriminator="repo_type")] = None - # `repo_code_hash` can be None because it is not used for the repo or because the job was - # submitted before 0.19.17. 
See `_get_repo_code_hash` on how to get the correct `repo_code_hash` - # TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant. + """`repo_data` is optional for client compatibility with pre-0.19.17 servers and for jobs + submitted before 0.19.17. All new jobs are expected to have non-`None` `repo_data`. + For `--no-repo` runs, `repo_data` is `VirtualRunRepoData()`. + """ + # TODO: drop this compatibility note when support for jobs submitted before 0.19.17 is no longer relevant. repo_code_hash: Optional[str] = None - # `repo_dir` was added in 0.19.27. Default value is set for backward compatibility + """`repo_code_hash` can be `None` because it is not used for the repo or because the job was + submitted before 0.19.17. See `_get_repo_code_hash` for how to get the correct value. + """ repo_dir: str = LEGACY_REPO_DIR - # None for jobs without repo and any jobs submitted by pre-0.20.0 clients + """`repo_dir` was added in 0.19.27 and uses a default value for backward compatibility.""" repo_exists_action: Optional[RepoExistsAction] = None + """`repo_exists_action` is `None` for jobs without a repo and for jobs submitted by pre-0.20.0 clients.""" file_archives: list[FileArchiveMapping] = [] - # None for non-services and pre-0.19.19 services. See `get_service_port` service_port: Optional[int] = None + """`service_port` is `None` for non-services and pre-0.19.19 services. See `get_service_port`.""" probes: list[ProbeSpec] = [] class JobProvisioningData(CoreModel): backend: BackendType - # In case backend provisions instance in another backend, it may set that backend as base_backend. base_backend: Optional[BackendType] = None + """`base_backend` may be set when a backend provisions an instance in another backend and wants + to record that backend as `base_backend`. + """ instance_type: InstanceType instance_id: str - # hostname may not be set immediately after instance provisioning. 
- # It is set to a public IP or, if public IPs are disabled, to a private IP. hostname: Optional[str] = None + """`hostname` may not be set immediately after instance provisioning. + It is set to a public IP or, if public IPs are disabled, to a private IP. + """ internal_ip: Optional[str] = None - # public_ip_enabled can used to distinguished instances with and without public IPs. - # hostname being None is not enough since it can be filled after provisioning. public_ip_enabled: bool = True - # instance_network a network address for multimode installation. Specified as `/` - # internal_ip will be selected from the specified network + """`public_ip_enabled` is used to distinguish instances with and without public IPs. + `hostname` being `None` is not enough because it can be filled after provisioning. + """ instance_network: Optional[str] = None + """`instance_network` stores the multimode installation network, specified as + `/`. `internal_ip` will be selected from the specified network. + """ region: str availability_zone: Optional[str] = None reservation: Optional[str] = None price: float username: str - # ssh_port be different from 22 for some backends. - # ssh_port may not be set immediately after instance provisioning ssh_port: Optional[int] = None - dockerized: bool # True if backend starts shim + """`ssh_port` may be different from 22 for some backends and may not be set immediately after + instance provisioning. + """ + dockerized: bool + """`dockerized` is `True` when the backend starts the shim.""" ssh_proxy: Optional[SSHConnectionParams] = None - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" def get_base_backend(self) -> BackendType: if self.base_backend is not None: @@ -340,22 +354,29 @@ class JobRuntimeData(CoreModel): """ network_mode: NetworkMode - # GPU, CPU, memory resource shares. 
None means all available (no limit) gpu: Optional[int] = None + """`gpu` stores the GPU resource share. `None` means all available with no limit.""" cpu: Optional[float] = None + """`cpu` stores the CPU resource share. `None` means all available with no limit.""" memory: Optional[Memory] = None - # container:host port mapping reported by shim. Empty dict if network_mode == NetworkMode.HOST - # None if data is not yet available (on vm-based backends and ssh instances) - # or not applicable (container-based backends) + """`memory` stores the memory resource share. `None` means all available with no limit.""" ports: Optional[dict[int, int]] = None - # List of volumes used by the job - volume_names: Optional[list[str]] = None # None for backward compatibility - # Virtual shared offer - offer: Optional[InstanceOfferWithAvailability] = None # None for backward compatibility - # Resolved working directory and OS username reported by the runner. - # None if the runner hasn't reported them yet or if it's an old runner. + """`ports` stores the container-to-host port mapping reported by shim. It is an empty dict if + `network_mode == NetworkMode.HOST`. `None` if data is not yet available + on VM-based backends and SSH instances, or not applicable on container-based backends. + """ + volume_names: Optional[list[str]] = None + """`volume_names` stores the list of volumes used by the job. It is `None` for backward compatibility.""" + offer: Optional[InstanceOfferWithAvailability] = None + """`offer` stores the virtual shared offer. It is `None` for backward compatibility.""" working_dir: Optional[str] = None + """`working_dir` stores the resolved working directory reported by the runner. + `None` if the runner has not reported it yet or if it is an old runner. + """ username: Optional[str] = None + """`username` stores the resolved OS username reported by the runner. + `None` if the runner has not reported it yet or if it is an old runner. 
+ """ class ClusterInfo(CoreModel): @@ -371,16 +392,19 @@ class Probe(CoreModel): class JobSubmission(CoreModel): id: UUID4 submission_num: int - deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers + deployment_num: int = 0 + """`deployment_num` uses a default value for compatibility with pre-0.19.14 servers.""" submitted_at: datetime last_processed_at: datetime finished_at: Optional[datetime] = None inactivity_secs: Optional[int] = None status: JobStatus - status_message: str = "" # default for backward compatibility - # termination_reason stores JobTerminationReason. - # str allows adding new enum members without breaking compatibility with old clients. + status_message: str = "" + """`status_message` uses a default value for backward compatibility.""" termination_reason: Optional[str] = None + """`termination_reason` stores `JobTerminationReason`. + `str` allows adding new enum members without breaking compatibility with old clients. + """ termination_reason_message: Optional[str] = None exit_status: Optional[int] = None job_provisioning_data: Optional[JobProvisioningData] = None @@ -413,7 +437,7 @@ def schema_extra(schema: Dict[str, Any]): class RunSpec(generate_dual_core_model(RunSpecConfig)): - # TODO: run_name is redundant here since they already passed in configuration + # TODO: consider removing `run_name` here because it is already passed in `configuration`. run_name: Annotated[ Optional[str], Field(description="The run name. If not set, the run name is generated automatically."), @@ -452,9 +476,10 @@ class RunSpec(generate_dual_core_model(RunSpecConfig)): list[FileArchiveMapping], Field(description="The list of file archive ID to container path mappings."), ] = [] - # Server uses configuration.working_dir since 0.19.27 and ignores this field, but the field - # still exists for compatibility with old clients that send it. 
working_dir: Optional[str] = None + """`working_dir` is kept for compatibility with old clients that still send it, even though the + server uses `configuration.working_dir` since 0.19.27 and ignores this field. + """ configuration_path: Annotated[ Optional[str], Field( @@ -473,10 +498,11 @@ class RunSpec(generate_dual_core_model(RunSpecConfig)): " Can be empty only before the run is submitted." ), ] = None - # merged_profile stores profile parameters merged from profile and configuration. - # Read profile parameters from merged_profile instead of profile directly. - # TODO: make merged_profile a computed field after migrating to pydanticV2 + # TODO: make `merged_profile` a computed field after migrating to Pydantic v2. merged_profile: Annotated[Profile, Field(exclude=True)] = None + """`merged_profile` stores profile parameters merged from `profile` and `configuration`. + Read profile parameters from `merged_profile` instead of `profile` directly. + """ @root_validator def _merged_profile(cls, values) -> Dict: @@ -546,16 +572,19 @@ class Run(CoreModel): submitted_at: datetime last_processed_at: datetime status: RunStatus - status_message: str = "" # default for backward compatibility - # termination_reason stores RunTerminationReason. - # str allows adding new enum members without breaking compatibility with old clients. + status_message: str = "" + """`status_message` uses a default value for backward compatibility.""" termination_reason: Optional[str] = None + """`termination_reason` stores `RunTerminationReason`. + `str` allows adding new enum members without breaking compatibility with old clients. 
+ """ run_spec: RunSpec jobs: List[Job] latest_job_submission: Optional[JobSubmission] = None cost: float = 0 service: Optional[ServiceSpec] = None - deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers + deployment_num: int = 0 + """`deployment_num` uses a default value for compatibility with pre-0.19.14 servers.""" error: Optional[str] = None deleted: Optional[bool] = None next_triggered_at: Optional[datetime] = None diff --git a/src/dstack/_internal/core/models/volumes.py b/src/dstack/_internal/core/models/volumes.py index 280ab14f10..701611402b 100644 --- a/src/dstack/_internal/core/models/volumes.py +++ b/src/dstack/_internal/core/models/volumes.py @@ -17,9 +17,10 @@ class VolumeStatus(str, Enum): SUBMITTED = "submitted" - # PROVISIONING is currently not used since on all backends supporting volumes, - # volumes become ACTIVE (ready to be used) almost immediately after provisioning. PROVISIONING = "provisioning" + """`PROVISIONING` is currently not used because on all backends supporting volumes, + volumes become `ACTIVE` almost immediately after provisioning. 
+ """ ACTIVE = "active" FAILED = "failed" @@ -88,12 +89,13 @@ class VolumeProvisioningData(CoreModel): volume_id: str size_gb: int availability_zone: Optional[str] = None - # price per month price: Optional[float] = None - # should be manually attached/detached + """`price` stores the monthly price.""" attachable: bool = True + """`attachable` shows whether the volume should be attached and detached manually.""" detachable: bool = True - backend_data: Optional[str] = None # backend-specific data in json + backend_data: Optional[str] = None + """`backend_data` stores backend-specific data in JSON.""" class VolumeAttachmentData(CoreModel): @@ -125,13 +127,15 @@ class Volume(CoreModel): status_message: Optional[str] = None deleted: bool deleted_at: Optional[datetime] = None - volume_id: Optional[str] = None # id of the volume in the cloud + volume_id: Optional[str] = None + """`volume_id` is the volume identifier in the cloud provider.""" provisioning_data: Optional[VolumeProvisioningData] = None cost: float = 0 attachments: Optional[List[VolumeAttachment]] = None - # attachment_data is deprecated in favor of attachments. - # It's only set for volumes that were attached before attachments. attachment_data: Optional[VolumeAttachmentData] = None + """`attachment_data` is deprecated in favor of `attachments`. + It is only set for volumes that were attached before attachments were introduced. + """ def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]: if self.attachments is not None: diff --git a/src/dstack/_internal/server/models.py b/src/dstack/_internal/server/models.py index d1a30b941b..da733054fa 100644 --- a/src/dstack/_internal/server/models.py +++ b/src/dstack/_internal/server/models.py @@ -84,9 +84,11 @@ class DecryptedString(generate_dual_core_model(DecryptedStringConfig)): This is useful so that application code can have custom handling of failed decrypts (e.g. ignoring). 
""" - # Do not read plaintext directly to avoid ignoring errors accidentally. - # Unpack with get_plaintext_or_error(). plaintext: Optional[str] + """ + `plaintext` should not be read directly to avoid ignoring errors accidentally. + Unpack with `get_plaintext_or_error()`. + """ decrypted: bool = True exc: Optional[Exception] = None @@ -211,20 +213,26 @@ class UserModel(BaseModel): name: Mapped[str] = mapped_column(String(50), unique=True) created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) token: Mapped[DecryptedString] = mapped_column(EncryptedString(200), unique=True) - # token_hash is needed for fast search by token when stored token is encrypted token_hash: Mapped[str] = mapped_column(String(2000), unique=True) + """`token_hash` is used for fast token lookup when the stored token is encrypted.""" global_role: Mapped[GlobalRole] = mapped_column(EnumAsString(GlobalRole, 100)) - # deactivated users cannot access API active: Mapped[bool] = mapped_column(Boolean, default=True) + """`active` controls whether the user can access the API.""" deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) - # `original_name` stores the name of a deleted user, while `name` is changed to a unique generated value. original_name: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + """`original_name` stores the deleted user's original name while `name` is changed to a unique + generated value. + """ - # SSH keys can be null for users created before 0.19.33. - # Keys for those users are being gradually generated on /get_my_user calls. - # TODO: make keys required in a future version. + # TODO: make these keys required in a future version. ssh_private_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + """`ssh_private_key` can be `null` for users created before 0.19.33. + Keys for those users are being gradually generated on `/get_my_user` calls. 
+ """ ssh_public_key: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + """`ssh_public_key` can be `null` for users created before 0.19.33. + Keys for those users are being gradually generated on `/get_my_user` calls. + """ email: Mapped[Optional[str]] = mapped_column(String(200), nullable=True, index=True) @@ -243,8 +251,10 @@ class ProjectModel(BaseModel): created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) is_public: Mapped[bool] = mapped_column(Boolean, default=False) deleted: Mapped[bool] = mapped_column(Boolean, default=False) - # `original_name` stores the name of a deleted project, while `name` is changed to a unique generated value. original_name: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + """`original_name` stores the deleted project's original name while `name` is changed to a unique + generated value. + """ owner_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) owner: Mapped[UserModel] = relationship(lazy="joined") @@ -264,14 +274,15 @@ class ProjectModel(BaseModel): foreign_keys=[default_gateway_id] ) - # TODO: Drop after the release without pools - # Note that multi-replica deployments can break if - # upgrading from an old version that uses pools to the version that drops pools from the DB. + # TODO: drop `default_pool_id` after the release without pools. default_pool_id: Mapped[Optional[UUIDType]] = mapped_column( ForeignKey("pools.id", use_alter=True, ondelete="SET NULL"), nullable=True, deferred=True, # Not loaded so it can be deleted in the next releases ) + """`default_pool_id` exists because multi-replica deployments can break when upgrading from an + old version that uses pools to the version that drops pools from the database. 
+ """ default_pool: Mapped[Optional["PoolModel"]] = relationship(foreign_keys=[default_pool_id]) @@ -286,8 +297,8 @@ class MemberModel(BaseModel): user_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) user: Mapped[UserModel] = relationship(lazy="joined") project_role: Mapped[ProjectRole] = mapped_column(EnumAsString(ProjectRole, 100)) - # member_num defines members ordering member_num: Mapped[Optional[int]] = mapped_column(Integer) + """`member_num` defines member ordering.""" class BackendModel(BaseModel): @@ -315,16 +326,18 @@ class RepoModel(BaseModel): ) project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship() - # RepoModel.name stores repo_id name: Mapped[str] = mapped_column(String(100)) + """`name` stores `repo_id`.""" type: Mapped[RepoType] = mapped_column(EnumAsString(RepoType, 100)) info: Mapped[str] = mapped_column(Text) - # `creds` is deprecated, for newly initialized repos per-user `RepoCredsModel` should be used - # instead. As of 0.18.25, there is no plan to remove this field, it's used as a fallback when - # `RepoCredsModel` associated with the user is not found. creds: Mapped[Optional[str]] = mapped_column(String(5000)) + """ + `creds` is deprecated. Newly initialized repos should use per-user `RepoCredsModel` instead. + As of 0.18.25 there is no plan to remove this field; it is used as a fallback when + `RepoCredsModel` associated with the user is not found. 
+ """ class RepoCredsModel(BaseModel): @@ -354,7 +367,8 @@ class CodeModel(BaseModel): repo_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("repos.id", ondelete="CASCADE")) repo: Mapped["RepoModel"] = relationship() blob_hash: Mapped[str] = mapped_column(String(4000)) - blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) # None means blob is stored on s3 + blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) + """`blob` is stored on S3 when it is `None`.""" class FileArchiveModel(BaseModel): @@ -369,7 +383,8 @@ class FileArchiveModel(BaseModel): user_id: Mapped["UserModel"] = mapped_column(ForeignKey("users.id", ondelete="CASCADE")) user: Mapped["UserModel"] = relationship() blob_hash: Mapped[str] = mapped_column(Text) - blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) # None means blob is stored on s3 + blob: Mapped[Optional[bytes]] = mapped_column(LargeBinary) + """`blob` is stored on S3 when it is `None`.""" class RunModel(BaseModel): @@ -389,23 +404,26 @@ class RunModel(BaseModel): repo_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("repos.id", ondelete="CASCADE")) repo: Mapped["RepoModel"] = relationship() - # Runs reference fleets so that fleets cannot be deleted while they are used. - # A fleet can have no busy instances but still be used by a run (e.g. a service with 0 replicas). fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id")) + """`fleet_id` keeps runs attached to fleets so the fleets cannot be deleted while they are used. + A fleet can have no busy instances but still be used by a run, for example a service with + zero replicas. 
+ """ fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="runs") run_name: Mapped[str] = mapped_column(String(100)) submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) next_triggered_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # NOTE: `status` must be changed only via `switch_run_status()` status: Mapped[RunStatus] = mapped_column(EnumAsString(RunStatus, 100), index=True) + """`status` must be changed only via `switch_run_status()`.""" termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column( EnumAsString(RunTerminationReason, 100) ) - # resubmission_attempt counts consecutive transitions to pending without provisioning. - # Can be used to choose retry delay depending on the attempt number. resubmission_attempt: Mapped[int] = mapped_column(Integer, default=0) + """`resubmission_attempt` counts consecutive transitions to pending without provisioning. + It can be used to choose a retry delay based on the attempt number. + """ run_spec: Mapped[str] = mapped_column(Text) service_spec: Mapped[Optional[str]] = mapped_column(Text) priority: Mapped[int] = mapped_column(Integer, default=0) @@ -439,9 +457,10 @@ class JobModel(BaseModel): ) run: Mapped["RunModel"] = relationship() - # Jobs need to reference fleets because we may choose an optimal fleet for a master job - # but not yet create an instance for it. fleet_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("fleets.id")) + """`fleet_id` keeps jobs attached to fleets because we may choose an optimal fleet for a master + job but not yet create an instance for it. 
+ """ fleet: Mapped[Optional["FleetModel"]] = relationship(back_populates="jobs") run_name: Mapped[str] = mapped_column(String(100)) @@ -450,26 +469,29 @@ class JobModel(BaseModel): submission_num: Mapped[int] = mapped_column(Integer) submitted_at: Mapped[datetime] = mapped_column(NaiveDateTime) last_processed_at: Mapped[datetime] = mapped_column(NaiveDateTime) - # NOTE: `status` must be changed only via `switch_job_status()` status: Mapped[JobStatus] = mapped_column(EnumAsString(JobStatus, 100), index=True) + """`status` must be changed only via `switch_job_status()`.""" termination_reason: Mapped[Optional[JobTerminationReason]] = mapped_column( EnumAsString(JobTerminationReason, 100) ) termination_reason_message: Mapped[Optional[str]] = mapped_column(Text) - # `disconnected_at` stores the first time of connectivity issues with the instance. - # Resets every time connectivity is restored. disconnected_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + """`disconnected_at` stores the first time connectivity issues were seen with the instance. + It resets every time connectivity is restored. 
+ """ exit_status: Mapped[Optional[int]] = mapped_column(Integer) job_spec_data: Mapped[str] = mapped_column(Text) job_provisioning_data: Mapped[Optional[str]] = mapped_column(Text) runner_timestamp: Mapped[Optional[int]] = mapped_column(BigInteger) - inactivity_secs: Mapped[Optional[int]] = mapped_column(Integer) # 0 - active, None - N/A - # `removed` is used to ensure that the instance is killed after the job is finished + inactivity_secs: Mapped[Optional[int]] = mapped_column(Integer) + """`inactivity_secs` uses `0` for active jobs and `None` when inactivity is not applicable.""" remove_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) + """`remove_at` is used to ensure the instance is killed after the job is finished.""" volumes_detached_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # `instance_assigned` means instance assignment was done. - # if `instance_assigned` is True and `instance` is None, no instance was assigned. instance_assigned: Mapped[bool] = mapped_column(Boolean, default=False) + """`instance_assigned` shows whether instance assignment has already been attempted. + If `instance_assigned` is `True` and `instance` is `None`, no instance was assigned. + """ instance_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("instances.id", ondelete="CASCADE") ) @@ -481,15 +503,16 @@ class JobModel(BaseModel): probes: Mapped[list["ProbeModel"]] = relationship( back_populates="job", order_by="ProbeModel.probe_num" ) - # Whether the replica is registered to receive service requests. - # Always `False` for non-service runs. registered: Mapped[bool] = mapped_column(Boolean, server_default=false()) - # `waiting_master_job` is `True` for non-master jobs that have to wait - # for master processing before they can be processed. - # This allows updating all replica jobs even when only master is locked, - # e.g. to provision instances for all jobs when processing master. 
- # If not set, all jobs should be processed only one-by-one. + """`registered` shows whether the replica is registered to receive service requests. + It is always `False` for non-service runs. + """ waiting_master_job: Mapped[Optional[bool]] = mapped_column(Boolean) + """`waiting_master_job` is `True` for non-master jobs that have to wait for master processing before + they can be processed. This allows updating all replica jobs even when only master is locked, + for example to provision instances for all jobs when processing master. If not set, all jobs + should be processed only one-by-one. + """ class GatewayModel(PipelineModelMixin, BaseModel): @@ -501,9 +524,10 @@ class GatewayModel(PipelineModelMixin, BaseModel): name: Mapped[str] = mapped_column(String(100)) region: Mapped[str] = mapped_column(String(100)) wildcard_domain: Mapped[Optional[str]] = mapped_column(String(100)) - # `configuration` is optional for compatibility with pre-0.18.2 gateways. - # Use `get_gateway_configuration` to construct `configuration` for old gateways. configuration: Mapped[Optional[str]] = mapped_column(Text) + """`configuration` is Optional for compatibility with pre-0.18.2 gateways. + Use `get_gateway_configuration` to construct `configuration` for old gateways. + """ created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) status: Mapped[GatewayStatus] = mapped_column(EnumAsString(GatewayStatus, 100)) status_message: Mapped[Optional[str]] = mapped_column(Text) @@ -537,9 +561,10 @@ class GatewayComputeModel(BaseModel): instance_id: Mapped[str] = mapped_column(String(100)) ip_address: Mapped[str] = mapped_column(String(100)) hostname: Mapped[Optional[str]] = mapped_column(String(100)) - # `configuration` is optional for compatibility with pre-0.18.2 gateways. - # Use `get_gateway_compute_configuration` to construct `configuration` for old gateways. 
configuration: Mapped[Optional[str]] = mapped_column(Text) + """`configuration` is optional for compatibility with pre-0.18.2 gateways. + Use `get_gateway_compute_configuration` to construct `configuration` for old gateways. + """ backend_data: Mapped[Optional[str]] = mapped_column(Text) region: Mapped[str] = mapped_column(String(100)) @@ -548,12 +573,12 @@ class GatewayComputeModel(BaseModel): ) backend: Mapped[Optional["BackendModel"]] = relationship() - # The key to authorize the server with the gateway ssh_private_key: Mapped[str] = mapped_column(Text) + """`ssh_private_key` is the key used to authorize the server with the gateway.""" ssh_public_key: Mapped[str] = mapped_column(Text) - # active means the server should maintain connection to gateway. active: Mapped[bool] = mapped_column(Boolean, default=True) + """`active` means the server should maintain a connection to the gateway.""" deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) app_updated_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) @@ -594,8 +619,8 @@ class FleetModel(PipelineModelMixin, BaseModel): deleted: Mapped[bool] = mapped_column(Boolean, default=False) deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # NOTE: `status` must be changed only via `switch_fleet_status()` status: Mapped[FleetStatus] = mapped_column(EnumAsString(FleetStatus, 100), index=True) + """`status` must be changed only via `switch_fleet_status()`.""" status_message: Mapped[Optional[str]] = mapped_column(Text) spec: Mapped[str] = mapped_column(Text) @@ -611,9 +636,10 @@ class FleetModel(PipelineModelMixin, BaseModel): UUIDType(binary=False), index=True ) - # `consolidation_attempt` counts how many times in a row fleet needed consolidation. - # Allows increasing delays between attempts. 
consolidation_attempt: Mapped[int] = mapped_column(Integer, server_default="0") + """`consolidation_attempt` counts how many times in a row the fleet needed consolidation. + It allows increasing delays between attempts. + """ last_consolidated_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) __table_args__ = ( @@ -646,7 +672,7 @@ class InstanceModel(PipelineModelMixin, BaseModel): project_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("projects.id", ondelete="CASCADE")) project: Mapped["ProjectModel"] = relationship(foreign_keys=[project_id]) - # TODO: Drop after the release without pools + # TODO: drop `pool_id` after the release without pools. pool_id: Mapped[Optional[uuid.UUID]] = mapped_column( ForeignKey("pools.id"), deferred=True, # Not loaded so it can be deleted in the next releases @@ -662,38 +688,36 @@ class InstanceModel(PipelineModelMixin, BaseModel): compute_group_id: Mapped[Optional[uuid.UUID]] = mapped_column(ForeignKey("compute_groups.id")) compute_group: Mapped[Optional["ComputeGroupModel"]] = relationship(back_populates="instances") - # NOTE: `status` must be changed only via `switch_instance_status()` status: Mapped[InstanceStatus] = mapped_column(EnumAsString(InstanceStatus, 100), index=True) + """`status` must be changed only via `switch_instance_status()`.""" unreachable: Mapped[bool] = mapped_column(Boolean) - # VM started_at: Mapped[Optional[datetime]] = mapped_column( NaiveDateTime, default=get_current_datetime ) + """`started_at` is used only for VM instances.""" finished_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # create instance - # TODO: Introduce a field that would store all resolved instance profile parameters, etc, (similar to job_spec). - # Currently, profile parameters are parsed every time they are accessed (e.g. see profile.retry). + # TODO: introduce a field that stores all resolved instance profile parameters, similar to `job_spec`. 
profile: Mapped[Optional[str]] = mapped_column(Text) + """`profile` stores raw profile data. Profile parameters are currently parsed every time they are + accessed, for example through `profile.retry`. + """ requirements: Mapped[Optional[str]] = mapped_column(Text) instance_configuration: Mapped[Optional[str]] = mapped_column(Text) termination_policy: Mapped[Optional[TerminationPolicy]] = mapped_column(String(100)) - # TODO: Suggestion: do not assign DEFAULT_FLEET_TERMINATION_IDLE_TIME as the default here - # (make Optional instead; also instead of -1) + # TODO: consider not assigning `DEFAULT_FLEET_TERMINATION_IDLE_TIME` here and making this optional. termination_idle_time: Mapped[int] = mapped_column( Integer, default=DEFAULT_FLEET_TERMINATION_IDLE_TIME ) + """`termination_idle_time` stores the idle timeout used for termination decisions.""" - # Deprecated last_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime, deferred=True) + """`last_retry_at` is deprecated.""" - # instance termination handling termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) - # dstack versions prior to 0.20.1 represented instance termination reasons as raw strings. - # Such strings may still be stored in the database, so we are using a wide column (4000 chars) - # and a fallback deserializer to convert them to relevant enum members. + """`termination_deadline` is used for instance termination handling.""" termination_reason: Mapped[Optional[InstanceTerminationReason]] = mapped_column( EnumAsString( InstanceTerminationReason, @@ -701,9 +725,13 @@ class InstanceModel(PipelineModelMixin, BaseModel): fallback_deserializer=InstanceTerminationReason.from_legacy_str, ) ) + """`termination_reason` may need legacy deserialization because dstack versions prior to 0.20.1 represented instance termination + reasons as raw strings. 
Such strings may still be stored in the database, so this uses a + wide column and a fallback deserializer to convert them to relevant enum members. + """ termination_reason_message: Mapped[Optional[str]] = mapped_column(String(4000)) - # Deprecated since 0.19.22, not used health_status: Mapped[Optional[str]] = mapped_column(String(4000), deferred=True) + """`health_status` is deprecated since 0.19.22 and is no longer used.""" health: Mapped[HealthStatus] = mapped_column( EnumAsString(HealthStatus, 100), default=HealthStatus.HEALTHY ) @@ -713,8 +741,8 @@ class InstanceModel(PipelineModelMixin, BaseModel): backend: Mapped[Optional[BackendType]] = mapped_column(EnumAsString(BackendType, 100)) backend_data: Mapped[Optional[str]] = mapped_column(Text) - # Not set for cloud fleets that haven't been provisioning offer: Mapped[Optional[str]] = mapped_column(Text) + """`offer` is not set for cloud fleets that have not started provisioning.""" region: Mapped[Optional[str]] = mapped_column(String(2000)) price: Mapped[Optional[float]] = mapped_column(Float) @@ -722,8 +750,8 @@ class InstanceModel(PipelineModelMixin, BaseModel): remote_connection_info: Mapped[Optional[str]] = mapped_column(Text) - # NULL means `auto` (only during provisioning, when ready it's not NULL) total_blocks: Mapped[Optional[int]] = mapped_column(Integer) + """`total_blocks` uses `NULL` to mean `auto` during provisioning; once ready it is not `NULL`.""" busy_blocks: Mapped[int] = mapped_column(Integer, default=0) jobs: Mapped[list["JobModel"]] = relationship(back_populates="instance") @@ -785,19 +813,19 @@ class VolumeModel(PipelineModelMixin, BaseModel): deleted_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime) to_be_deleted: Mapped[bool] = mapped_column(Boolean, server_default=false()) - # NOTE: `status` must be changed only via `switch_volume_status()` status: Mapped[VolumeStatus] = mapped_column(EnumAsString(VolumeStatus, 100), index=True) + """`status` must be changed only via 
`switch_volume_status()`.""" status_message: Mapped[Optional[str]] = mapped_column(Text) configuration: Mapped[str] = mapped_column(Text) volume_provisioning_data: Mapped[Optional[str]] = mapped_column(Text) - # auto_cleanup_enabled is set for all new models but old models may not have it. auto_cleanup_enabled: Mapped[Optional[bool]] = mapped_column(Boolean) + """`auto_cleanup_enabled` is set for all new models, but old models may not have it.""" attachments: Mapped[List["VolumeAttachmentModel"]] = relationship(back_populates="volume") - # Deprecated in favor of VolumeAttachmentModel.attachment_data volume_attachment_data: Mapped[Optional[str]] = mapped_column(Text) + """`volume_attachment_data` is deprecated in favor of `VolumeAttachmentModel.attachment_data`.""" __table_args__ = ( Index( @@ -832,7 +860,7 @@ class PlacementGroupModel(PipelineModelMixin, BaseModel): fleet_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("fleets.id")) fleet: Mapped["FleetModel"] = relationship(foreign_keys=[fleet_id]) - # TODO: rename `fleet_deleted` -> `to_be_deleted` + # TODO: rename `fleet_deleted` to `to_be_deleted`. 
fleet_deleted: Mapped[bool] = mapped_column(Boolean, default=False) created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime) @@ -908,9 +936,10 @@ class JobMetricsPoint(BaseModel): memory_usage_bytes: Mapped[int] = mapped_column(BigInteger) memory_working_set_bytes: Mapped[int] = mapped_column(BigInteger) - # json-encoded lists of metric values of len(gpus) length gpus_memory_usage_bytes: Mapped[str] = mapped_column(Text) + """`gpus_memory_usage_bytes` stores a JSON-encoded list of metric values with length `len(gpus)`.""" gpus_util_percent: Mapped[str] = mapped_column(Text) + """`gpus_util_percent` stores a JSON-encoded list of metric values with length `len(gpus)`.""" class JobPrometheusMetrics(BaseModel): @@ -920,8 +949,8 @@ class JobPrometheusMetrics(BaseModel): job: Mapped["JobModel"] = relationship() collected_at: Mapped[datetime] = mapped_column(NaiveDateTime) - # Raw Prometheus text response text: Mapped[str] = mapped_column(Text) + """`text` stores the raw Prometheus text response.""" class ProbeModel(BaseModel): @@ -936,7 +965,8 @@ class ProbeModel(BaseModel): job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True) job: Mapped["JobModel"] = relationship(back_populates="probes") - probe_num: Mapped[int] = mapped_column(Integer) # index in JobSpec.probes + probe_num: Mapped[int] = mapped_column(Integer) + """`probe_num` is the index in `JobSpec.probes`.""" due: Mapped[datetime] = mapped_column(NaiveDateTime) success_streak: Mapped[int] = mapped_column(BigInteger) active: Mapped[bool] = mapped_column(Boolean) diff --git a/src/dstack/_internal/server/schemas/health/dcgm.py b/src/dstack/_internal/server/schemas/health/dcgm.py index f6aeaa40e5..cf8f5ce506 100644 --- a/src/dstack/_internal/server/schemas/health/dcgm.py +++ b/src/dstack/_internal/server/schemas/health/dcgm.py @@ -32,17 +32,20 @@ class DCGMHealthIncident(CoreModel): See: 
https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L68-L73 """ - # dcgmIncidentInfo_t system: int + """`system` comes from `dcgmIncidentInfo_t`.""" health: DCGMHealthResult + """`health` comes from `dcgmIncidentInfo_t`.""" - # dcgmDiagErrorDetail_t error_message: str + """`error_message` comes from `dcgmDiagErrorDetail_t`.""" error_code: int + """`error_code` comes from `dcgmDiagErrorDetail_t`.""" - # dcgmGroupEntityPair_t - entity_group_id: int # dcgmGroupEntityPair_t + entity_group_id: int + """`entity_group_id` comes from `dcgmGroupEntityPair_t`.""" entity_id: int + """`entity_id` comes from `dcgmGroupEntityPair_t`.""" class DCGMHealthResponse(CoreModel): diff --git a/src/dstack/_internal/server/schemas/runner.py b/src/dstack/_internal/server/schemas/runner.py index 89649ddda6..43e9ddbb82 100644 --- a/src/dstack/_internal/server/schemas/runner.py +++ b/src/dstack/_internal/server/schemas/runner.py @@ -28,7 +28,8 @@ class JobStateEvent(CoreModel): class LogEvent(CoreModel): - timestamp: int # milliseconds + timestamp: int + """`timestamp` is stored in milliseconds.""" message: bytes @validator("message", pre=True) @@ -43,7 +44,8 @@ class PullResponse(CoreModel): job_logs: List[LogEvent] runner_logs: List[LogEvent] last_updated: int - no_connections_secs: Optional[int] = None # Optional for compatibility with old runners + no_connections_secs: Optional[int] = None + """`no_connections_secs` is optional for compatibility with old runners.""" class JobInfoResponse(CoreModel): @@ -101,8 +103,7 @@ class SubmitBody(CoreModel): cluster_info: Annotated[Optional[ClusterInfo], Field(include=True)] secrets: Annotated[Optional[Dict[str, str]], Field(include=True)] repo_credentials: Annotated[Optional[RemoteRepoCreds], Field(include=True)] - # run_spec is deprecated in favor of run.run_spec - # TODO: Remove once we no longer support instances deployed with 0.19.8 or earlier. 
+ # TODO: remove `run_spec` once instances deployed with 0.19.8 or earlier are no longer supported. run_spec: Annotated[ RunSpec, Field( @@ -115,6 +116,7 @@ class SubmitBody(CoreModel): }, ), ] + """`run_spec` is deprecated in favor of `run.run_spec`.""" class HealthcheckResponse(CoreModel): @@ -143,7 +145,8 @@ class ComponentStatus(str, Enum): class ComponentInfo(CoreModel): - name: str # Not using ComponentName enum for compatibility of newer shim with older server + name: str + """`name` does not use `ComponentName` so newer shim versions remain compatible with the older server.""" version: str status: ComponentStatus @@ -203,8 +206,10 @@ class TaskListItem(CoreModel): class TaskListResponse(CoreModel): - ids: Optional[list[str]] = None # returned by pre-0.19.26 shim - tasks: Optional[list[TaskListItem]] = None # returned by 0.19.26+ shim + ids: Optional[list[str]] = None + """`ids` is returned by pre-0.19.26 shim versions.""" + tasks: Optional[list[TaskListItem]] = None + """`tasks` is returned by shim versions 0.19.26 and newer.""" class TaskInfoResponse(CoreModel): @@ -212,8 +217,10 @@ class TaskInfoResponse(CoreModel): status: TaskStatus termination_reason: str termination_message: str - # default value for backward compatibility with 0.18.34, could be removed after a few releases ports: Optional[list[PortMapping]] = [] + """`ports` uses a default value for backward compatibility with 0.18.34. + It can be removed after a few releases. + """ class TaskSubmitRequest(CoreModel):