Skip to content

[UX] Show status message as retrying in case a run or job is being retried #2758

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions frontend/src/libs/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@ import { get as _get } from 'lodash';
import { StatusIndicatorProps } from '@cloudscape-design/components';

import { capitalize } from 'libs';
import { finishedRunStatuses } from '../pages/Runs/constants';

import { IModelExtended } from '../pages/Models/List/types';

export const getStatusIconType = (
status: IRun['status'] | TJobStatus,
terminationReason: string | null | undefined,
): StatusIndicatorProps['type'] => {
if (terminationReason === 'interrupted_by_no_capacity') {
if (finishedRunStatuses.includes(status) && terminationReason === 'interrupted_by_no_capacity') {
return 'stopped';
}
switch (status) {
Expand Down Expand Up @@ -41,24 +42,26 @@ export const getStatusIconColor = (
if (terminationReason === 'failed_to_start_due_to_no_capacity' || terminationReason === 'interrupted_by_no_capacity') {
return 'yellow';
}

switch (status) {
case 'submitted':
case 'pending':
return 'blue';
case 'pulling':
return 'green';
case 'aborted':
return 'yellow';
case 'done':
return 'blue';
return 'grey';
default:
return undefined;
}
};

export const getRunStatusMessage = (run: IRun): string => {
if (run.latest_job_submission?.status_message) {
if (finishedRunStatuses.includes(run.status) && run.latest_job_submission?.status_message) {
return capitalize(run.latest_job_submission.status_message);
} else {
return capitalize(run.status);
return capitalize(run.status_message || run.status);
}
};

Expand Down
5 changes: 3 additions & 2 deletions frontend/src/pages/Runs/Details/RunDetails/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import { Logs } from '../Logs';
import { getJobSubmissionId } from '../Logs/helpers';

import styles from './styles.module.scss';
import { finishedRunStatuses } from 'pages/Runs/constants';

export const RunDetails = () => {
const { t } = useTranslation();
Expand All @@ -47,8 +48,8 @@ export const RunDetails = () => {

if (!runData) return null;

const status = runData.latest_job_submission?.status ?? runData.status;
const terminationReason = runData.latest_job_submission?.termination_reason;
const status = finishedRunStatuses.includes(runData.status) ? runData.latest_job_submission?.status ?? runData.status : runData.status;
const terminationReason = finishedRunStatuses.includes(runData.status) ? runData.latest_job_submission?.termination_reason : null;

return (
<>
Expand Down
5 changes: 3 additions & 2 deletions frontend/src/pages/Runs/List/hooks/useColumnsDefinitions.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import {
getRunListItemResources,
getRunListItemSpotLabelKey,
} from '../helpers';
import { finishedRunStatuses } from 'pages/Runs/constants';

export const useColumnsDefinitions = () => {
const { t } = useTranslation();
Expand Down Expand Up @@ -65,8 +66,8 @@ export const useColumnsDefinitions = () => {
id: 'status',
header: t('projects.run.status'),
cell: (item: IRun) => {
const status = item.latest_job_submission?.status ?? item.status;
const terminationReason = item.latest_job_submission?.termination_reason;
const status = finishedRunStatuses.includes(item.status) ? item.latest_job_submission?.status ?? item.status : item.status;
const terminationReason = finishedRunStatuses.includes(item.status) ? item.latest_job_submission?.termination_reason : null;

return (
<StatusIndicator
Expand Down
2 changes: 2 additions & 0 deletions frontend/src/pages/Runs/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ export const runStatusForStopping: TJobStatus[] = ['submitted', 'provisioning',
export const runStatusForAborting: TJobStatus[] = ['submitted', 'provisioning', 'pulling', 'pending', 'running'];
export const unfinishedRuns: TJobStatus[] = ['running', 'terminating', 'pending'];
export const finishedJobs: TJobStatus[] = ['terminated', 'aborted', 'failed', 'done'];
// TODO: Replace TJobStatus with TRunStatus and remove all consts above
/**
 * Statuses treated as "finished" for a run: 'done', 'failed' and 'terminated'.
 * Note that, unlike `finishedJobs` above, this list does not contain 'aborted'.
 * The values are typed as TJobStatus for now (see the TODO above).
 */
export const finishedRunStatuses: TJobStatus[] = ['done', 'failed', 'terminated'];
1 change: 1 addition & 0 deletions frontend/src/types/run.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ declare interface IRun {
latest_job_submission?: IJobSubmission;
cost: number;
service: IRunService | null;
status_message?: string | null;
}

declare interface IMetricsItem {
Expand Down
10 changes: 5 additions & 5 deletions src/dstack/_internal/cli/utils/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,15 @@ def get_runs_table(
run_row: Dict[Union[str, int], Any] = {
"NAME": run.run_spec.run_name,
"SUBMITTED": format_date(run.submitted_at),
"STATUS": (
run.latest_job_submission.status_message
if run.status.is_finished() and run.latest_job_submission
else run.status_message
),
}
if run.error:
run_row["ERROR"] = run.error
if len(run.jobs) != 1:
run_row["STATUS"] = (
run.latest_job_submission.status_message
if run.latest_job_submission
else run.status
)
add_row_from_dict(table, run_row)

for job in run.jobs:
Expand Down
44 changes: 44 additions & 0 deletions src/dstack/_internal/core/models/runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,7 @@ class Run(CoreModel):
submitted_at: datetime
last_processed_at: datetime
status: RunStatus
status_message: Optional[str] = None
termination_reason: Optional[RunTerminationReason]
run_spec: RunSpec
jobs: List[Job]
Expand Down Expand Up @@ -524,6 +525,49 @@ def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[s
else:
return None

@root_validator
def _status_message(cls, values) -> Dict:
    """Fill in ``values["status_message"]`` from the run's status and retry state.

    Reads ``status``, ``run_spec`` and ``jobs`` from ``values``. The retry
    events come from ``run_spec.configuration.retry.on_events`` when both the
    run spec and its ``configuration.retry`` are truthy, otherwise an empty
    list is used. Only the first job is consulted for the last termination
    reason; with no jobs the reason is ``None``.

    If any of the three keys is absent, ``values`` is returned untouched
    (presumably the key is missing because that field failed its own
    validation -- confirm).
    """
    try:
        run_status = values["status"]
        spec: RunSpec = values["run_spec"]
        # NOTE(review): assumes a truthy `configuration.retry` exposes `on_events` -- confirm.
        if spec and spec.configuration.retry:
            events = spec.configuration.retry.on_events
        else:
            events = []
        run_jobs = values["jobs"]
        if run_jobs:
            last_reason = Run.get_last_termination_reason(run_jobs[0])
        else:
            last_reason = None
    except KeyError:
        # A required input is not available; leave `values` as it is.
        return values
    values["status_message"] = Run._get_status_message(
        status=run_status,
        retry_on_events=events,
        termination_reason=last_reason,
    )
    return values

@staticmethod
def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
    """Return the most recent non-``None`` termination reason of ``job``.

    Walks ``job.job_submissions`` from the last entry to the first and returns
    the first ``termination_reason`` that is set, or ``None`` when no
    submission has one (including when there are no submissions).
    """
    newest_first = (entry.termination_reason for entry in reversed(job.job_submissions))
    # The generator is lazy, so the scan stops at the first reason that is set.
    return next((reason for reason in newest_first if reason is not None), None)

@staticmethod
def _get_status_message(
    status: RunStatus,
    retry_on_events: List[RetryEvent],
    termination_reason: Optional[JobTerminationReason],
) -> str:
    """Return the user-facing status text for a run.

    The result is ``"retrying"`` only when all three hold: ``status`` is
    ``SUBMITTED`` or ``PENDING``, ``termination_reason`` is
    ``FAILED_TO_START_DUE_TO_NO_CAPACITY``, and ``RetryEvent.NO_CAPACITY`` is
    among ``retry_on_events``. In every other case ``status.value`` is returned.
    """
    # Currently, `retrying` is shown only for `no-capacity` events.
    # The checks run in the same order as a short-circuiting `and` chain.
    if status not in (RunStatus.SUBMITTED, RunStatus.PENDING):
        return status.value
    if termination_reason != JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
        return status.value
    if RetryEvent.NO_CAPACITY not in retry_on_events:
        return status.value
    return "retrying"


class JobPlan(CoreModel):
job_spec: JobSpec
Expand Down
1 change: 1 addition & 0 deletions src/dstack/api/server/_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def _get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
current_resource = plan.current_resource
if current_resource is not None:
current_resource_excludes = {}
current_resource_excludes["status_message"] = True
apply_plan_excludes["current_resource"] = current_resource_excludes
current_resource_excludes["run_spec"] = _get_run_spec_excludes(current_resource.run_spec)
job_submissions_excludes = {}
Expand Down
3 changes: 3 additions & 0 deletions src/tests/_internal/server/routers/test_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ def get_dev_env_run_dict(
"submitted_at": submitted_at,
"last_processed_at": last_processed_at,
"status": "submitted",
"status_message": "submitted",
"run_spec": {
"configuration": {
"entrypoint": None,
Expand Down Expand Up @@ -510,6 +511,7 @@ async def test_lists_runs(self, test_db, session: AsyncSession, client: AsyncCli
"submitted_at": run1_submitted_at.isoformat(),
"last_processed_at": run1_submitted_at.isoformat(),
"status": "submitted",
"status_message": "submitted",
"run_spec": run1_spec.dict(),
"jobs": [
{
Expand Down Expand Up @@ -563,6 +565,7 @@ async def test_lists_runs(self, test_db, session: AsyncSession, client: AsyncCli
"submitted_at": run2_submitted_at.isoformat(),
"last_processed_at": run2_submitted_at.isoformat(),
"status": "submitted",
"status_message": "submitted",
"run_spec": run2_spec.dict(),
"jobs": [],
"latest_job_submission": None,
Expand Down
Loading