Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .agents/skills/transformerlab-cli/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -886,6 +886,7 @@ This applies to launching jobs, fetching logs, checking cluster status, and ever
| `lab provider update <id>` | Update provider config | No |
| `lab provider delete <id>` | Delete a provider (`--no-interactive` to skip prompt) | No |
| `lab provider check <id>` | Check provider health | No |
| `lab provider verify-lifecycle <id>` | Verify provider lifecycle via a storage probe (`--no-wait` to launch only; see `--help` for polling options) | No |
| `lab provider enable <id>` | Enable a provider | No |
| `lab provider disable <id>` | Disable a provider | No |
| `lab model list` | List all model groups | No |
Expand Down
2 changes: 1 addition & 1 deletion cli/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "transformerlab-cli"
version = "0.0.59"
version = "0.0.60"
description = "Transformer Lab CLI"
requires-python = ">=3.10"
authors = [{ name = "Transformer Lab", email = "hello@transformerlab.ai" }]
Expand Down
95 changes: 95 additions & 0 deletions cli/src/transformerlab_cli/commands/provider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import time

import typer

Expand Down Expand Up @@ -617,6 +618,100 @@ def command_provider_check(
raise typer.Exit(1)


@app.command("verify-lifecycle")
def command_provider_verify_lifecycle(
    provider_id: str = typer.Argument(..., help="Provider ID to verify"),
    no_wait: bool = typer.Option(
        False,
        "--no-wait",
        help="Launch the probe job and print its job ID without waiting for the result.",
    ),
    poll_interval: float = typer.Option(
        20.0,
        "--poll-interval",
        # A negative interval would make time.sleep() raise ValueError mid-run,
        # after the probe job has already been launched; reject it at parse time.
        min=0.0,
        help="Seconds to wait between checks for the sentinel file.",
    ),
    max_polls: int = typer.Option(
        10,
        "--max-polls",
        # With zero (or fewer) polls the loop below never runs, so the command
        # would launch a job and then report "not found" without ever checking.
        min=1,
        help="Maximum number of times to check for the sentinel file before giving up.",
    ),
):
    """Verify a compute provider's lifecycle by running a storage probe.

    Launches a minimal probe job on the provider that writes a sentinel file to
    shared storage, then polls until the file appears (pass) or it times out (fail).
    This confirms the provider can launch a job and reach shared storage end-to-end.
    """
    # NOTE: the docstring above is rendered by Typer as this command's --help text,
    # so implementation notes live in comments rather than in the docstring.
    #
    # Exit behaviour:
    #   * returns normally (exit 0) when the sentinel is found, or when --no-wait
    #     is given and the launch succeeded;
    #   * raises typer.Exit(1) when the launch fails, the launch response has no
    #     job ID, a status check returns a non-200 response, or the sentinel is
    #     not found within max_polls checks.
    check_configs(output_format=cli_state.output_format)

    # The same endpoint is used to launch the probe (POST) and, with the job ID
    # appended, to check for the sentinel file (GET).
    base = f"/compute_provider/providers/{provider_id}/debug/storage-probe"

    with console.status(
        f"[bold success]Launching storage probe on provider {provider_id}...[/bold success]", spinner="dots"
    ):
        launch_res = api.post_json(base, timeout=60.0)

    if launch_res.status_code == 404:
        console.print(f"[error]Error:[/error] Provider {provider_id} not found.")
        raise typer.Exit(1)
    if launch_res.status_code != 200:
        console.print(f"[error]Error:[/error] Failed to launch storage probe. {_extract_error_detail(launch_res)}")
        raise typer.Exit(1)

    job_id = launch_res.json().get("job_id")
    if job_id is None:
        console.print("[error]Error:[/error] Launch did not return a job ID.")
        raise typer.Exit(1)

    if no_wait:
        # Fire-and-forget mode: report the job ID and let the caller inspect it later.
        if cli_state.output_format == "json":
            print(json.dumps({"job_id": job_id, "status": "launched"}))
        else:
            console.print(f"[success]✓[/success] Storage probe launched as job [bold]{job_id}[/bold].")
            console.print(f"  Inspect it with: lab job info {job_id}")
        return

    check_url = f"{base}/{job_id}"
    found = False
    last_path = None
    # TODO: This loop only watches for the sentinel file and does not inspect job status,
    # so if the probe job fails immediately it will still run all max_polls before reporting
    # failure. Consider checking job status here to bail out early on a FAILED job.
    for attempt in range(1, max_polls + 1):
        check_res = api.get(check_url, timeout=60.0)
        if check_res.status_code != 200:
            console.print(f"[error]Error:[/error] Could not check probe status. {_extract_error_detail(check_res)}")
            raise typer.Exit(1)

        check_data = check_res.json()
        # Remember the most recently reported path so it can be shown in the final report.
        last_path = check_data.get("path")
        if check_data.get("found"):
            found = True
            break

        # Sleep only between checks; there is nothing to wait for after the last one.
        if attempt < max_polls:
            with console.status(
                f"[bold]Waiting for sentinel file (attempt {attempt}/{max_polls})...[/bold]", spinner="dots"
            ):
                time.sleep(poll_interval)

    if cli_state.output_format == "json":
        print(json.dumps({"job_id": job_id, "found": found, "path": last_path}))
        if not found:
            raise typer.Exit(1)
        return

    if found:
        console.print(f"[success]✓[/success] Lifecycle verified — sentinel found in shared storage ({last_path}).")
    else:
        console.print(
            f"[error]✗[/error] Lifecycle verification failed — sentinel file not found in shared storage "
            f"after {max_polls} checks (job {job_id})."
        )
        raise typer.Exit(1)


@app.command("enable")
def command_provider_enable(
provider_id: str = typer.Argument(..., help="Provider ID to enable"),
Expand Down
44 changes: 44 additions & 0 deletions cli/tests/commands/test_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,50 @@ def test_provider_check_shows_reason_and_fails(_mock_check, _mock_api):
assert "Bad API key" in result.output


@patch(
    "transformerlab_cli.commands.provider.api.get",
    return_value=_mock_response(200, {"found": True, "path": "/ws/debug/storage-probe-7.txt"}),
)
@patch(
    "transformerlab_cli.commands.provider.api.post_json",
    return_value=_mock_response(200, {"job_id": 7, "experiment_id": "__storage_probe__"}),
)
@patch("transformerlab_cli.commands.provider.check_configs")
def test_provider_verify_lifecycle_passes(_mock_check, _mock_post, _mock_get):
    """A sentinel reported as found makes verify-lifecycle exit 0 with the success message."""
    # The patched GET reports found=True on the first check, so the command never sleeps.
    cli_args = ["provider", "verify-lifecycle", "p1"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0
    assert "Lifecycle verified" in outcome.output


@patch(
    "transformerlab_cli.commands.provider.api.get",
    return_value=_mock_response(200, {"found": False, "path": "/ws/debug/storage-probe-7.txt"}),
)
@patch(
    "transformerlab_cli.commands.provider.api.post_json",
    return_value=_mock_response(200, {"job_id": 7, "experiment_id": "__storage_probe__"}),
)
@patch("transformerlab_cli.commands.provider.check_configs")
def test_provider_verify_lifecycle_fails_on_timeout(_mock_check, _mock_post, _mock_get):
    """A sentinel that never appears makes verify-lifecycle exit 1 with the failure message."""
    # A single poll keeps the test fast: the command only sleeps between checks,
    # so with --max-polls 1 it performs one check and reports failure immediately.
    cli_args = ["provider", "verify-lifecycle", "p1", "--max-polls", "1"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 1
    assert "Lifecycle verification failed" in outcome.output


@patch(
    "transformerlab_cli.commands.provider.api.post_json",
    return_value=_mock_response(200, {"job_id": 7, "experiment_id": "__storage_probe__"}),
)
@patch("transformerlab_cli.commands.provider.check_configs")
def test_provider_verify_lifecycle_no_wait(_mock_check, _mock_post):
    """With --no-wait the command exits 0 after launching and prints the returned job ID."""
    # Only the launch POST is patched here; the status-check GET is left unpatched.
    cli_args = ["provider", "verify-lifecycle", "p1", "--no-wait"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code == 0
    assert "7" in outcome.output


@patch("transformerlab_cli.commands.provider.api.patch", return_value=_mock_response(200))
@patch("transformerlab_cli.commands.provider.check_configs")
def test_provider_set_default(_mock_check, mock_patch):
Expand Down
2 changes: 1 addition & 1 deletion cli/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading