@@ -325,56 +325,45 @@ def duration(self) -> timedelta:
325
325
end_time = self .finished_at
326
326
return end_time - self .submitted_at
327
327
328
def dict(self, *args, **kwargs) -> Dict:
    """Serialize the submission, injecting the computed display fields.

    ``status_message`` and ``error`` are derived from the current field
    values on every call and written into the resulting dictionary,
    replacing whatever the base serialization produced for those keys.
    All positional and keyword arguments are forwarded to ``CoreModel.dict``.
    """
    # Derive the computed values first, then serialize.
    computed = {
        "status_message": self._get_status_message(),
        "error": self._get_error(),
    }
    # super() does not work with pydantic-duality
    serialized = CoreModel.dict(self, *args, **kwargs)
    serialized.update(computed)
    return serialized
342
336
343
def _get_status_message(self) -> Optional[str]:
    """Build the short human-readable status label for this submission.

    The label depends on ``status`` and, for failed or terminated
    submissions, on ``termination_reason``. Any combination without a
    dedicated label falls back to the raw ``status`` value.
    """
    status = self.status
    reason = self.termination_reason

    if status == JobStatus.DONE:
        return "exited (0)"

    if status == JobStatus.FAILED:
        if reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
            return f"exited ({self.exit_status})"
        if reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
            return "no offers"
        if reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
            return "interrupted"
        # Every other failure reason is reported generically.
        return "error"

    if status == JobStatus.TERMINATED:
        if reason == JobTerminationReason.TERMINATED_BY_USER:
            return "stopped"
        if reason == JobTerminationReason.ABORTED_BY_USER:
            return "aborted"
        # Other termination reasons fall through to the raw status value.

    return status.value
366
357
367
def _get_error(self) -> Optional[str]:
    """Return the error text for this submission's termination reason.

    Delegates to ``JobSubmission._termination_reason_to_error`` with the
    current ``termination_reason``.
    """
    reason = self.termination_reason
    return JobSubmission._termination_reason_to_error(termination_reason=reason)
375
362
376
363
@staticmethod
377
- def _get_error (termination_reason : Optional [JobTerminationReason ]) -> Optional [str ]:
364
+ def _termination_reason_to_error (
365
+ termination_reason : Optional [JobTerminationReason ],
366
+ ) -> Optional [str ]:
378
367
error_mapping = {
379
368
JobTerminationReason .INSTANCE_UNREACHABLE : "instance unreachable" ,
380
369
JobTerminationReason .WAITING_INSTANCE_LIMIT_EXCEEDED : "waiting instance limit exceeded" ,
@@ -395,6 +384,12 @@ class Job(CoreModel):
395
384
job_spec : JobSpec
396
385
job_submissions : List [JobSubmission ]
397
386
387
def get_last_termination_reason(self) -> Optional[JobTerminationReason]:
    """Return the termination reason of the latest submission that has one.

    Submissions are scanned from the end of ``job_submissions`` backwards;
    ``None`` is returned when no submission carries a termination reason.
    """
    newest_first = (
        submission.termination_reason for submission in reversed(self.job_submissions)
    )
    return next((reason for reason in newest_first if reason is not None), None)
392
+
398
393
399
394
class RunSpec (CoreModel ):
400
395
# TODO: run_name, working_dir are redundant here since they already passed in configuration
@@ -525,87 +520,70 @@ class Run(CoreModel):
525
520
last_processed_at : datetime
526
521
status : RunStatus
527
522
status_message : Optional [str ] = None
528
- termination_reason : Optional [RunTerminationReason ]
523
+ termination_reason : Optional [RunTerminationReason ] = None
529
524
run_spec : RunSpec
530
525
jobs : List [Job ]
531
- latest_job_submission : Optional [JobSubmission ]
526
+ latest_job_submission : Optional [JobSubmission ] = None
532
527
cost : float = 0
533
528
service : Optional [ServiceSpec ] = None
534
529
deployment_num : int = 0 # default for compatibility with pre-0.19.14 servers
535
530
# TODO: make error a computed field after migrating to pydanticV2
536
531
error : Optional [str ] = None
537
532
deleted : Optional [bool ] = None
538
533
539
def dict(self, *args, **kwargs) -> Dict:
    """Serialize the run, injecting the computed display fields.

    ``status_message`` and ``error`` are derived from the current field
    values on every call and written into the resulting dictionary,
    replacing whatever the base serialization produced for those keys.
    All positional and keyword arguments are forwarded to ``CoreModel.dict``.
    """
    # Derive the computed values first, then serialize.
    computed = {
        "status_message": self._get_status_message(),
        "error": self._get_error(),
    }
    # super() does not work with pydantic-duality
    serialized = CoreModel.dict(self, *args, **kwargs)
    serialized.update(computed)
    return serialized
542
+
543
def _get_error(self) -> Optional[str]:
    """Return the error text for this run's termination reason.

    Delegates to ``Run._termination_reason_to_error`` with the current
    ``termination_reason``.
    """
    reason = self.termination_reason
    return Run._termination_reason_to_error(termination_reason=reason)
547
545
548
546
@staticmethod
def _termination_reason_to_error(
    termination_reason: Optional[RunTerminationReason],
) -> Optional[str]:
    """Translate a run termination reason into an error message.

    Only ``RETRY_LIMIT_EXCEEDED`` and ``SERVER_ERROR`` have messages; any
    other value, including ``None``, yields ``None``.
    """
    known_errors = (
        (RunTerminationReason.RETRY_LIMIT_EXCEEDED, "retry limit exceeded"),
        (RunTerminationReason.SERVER_ERROR, "server error"),
    )
    for reason, message in known_errors:
        if termination_reason == reason:
            return message
    return None
556
556
557
def _get_status_message(self) -> Optional[str]:
    """Build the short human-readable status label for this run.

    Returns, in order of precedence:
      * the raw ``status`` value when the run has no jobs;
      * ``"pulling"`` when the run has exactly one job and that job's most
        recent submission is in the ``PULLING`` status;
      * ``"retrying"`` when the run is ``SUBMITTED`` or ``PENDING``, the
        first job's last termination reason is
        ``FAILED_TO_START_DUE_TO_NO_CAPACITY``, and the first job's retry
        policy includes ``RetryEvent.NO_CAPACITY``;
      * the raw ``status`` value otherwise.
    """
    jobs = self.jobs
    if not jobs:
        return self.status.value

    # Only the first job in the list is inspected.
    first_job = jobs[0]
    # FIXME: status_message should not require all job submissions for status calculation
    # since it's very expensive and is not required for anything else.
    # May return a different status if not all job submissions requested.
    # TODO: Calculate status_message by looking at job models directly instead job submissions.
    termination_reason = first_job.get_last_termination_reason()

    # FIXME: Clarify why show "pulling" only in case of one job
    submissions = first_job.job_submissions
    if len(jobs) == 1 and submissions and submissions[-1].status == JobStatus.PULLING:
        return "pulling"

    retry = first_job.job_spec.retry
    retry_on_events = retry.on_events if retry else []
    # Currently, `retrying` is shown only for `no-capacity` events
    awaiting_start = self.status in [RunStatus.SUBMITTED, RunStatus.PENDING]
    failed_no_capacity = (
        termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
    )
    if awaiting_start and failed_no_capacity and RetryEvent.NO_CAPACITY in retry_on_events:
        return "retrying"

    return self.status.value
609
587
610
588
def is_deployment_in_progress (self ) -> bool :
611
589
return any (
0 commit comments