Skip to content

Commit ea1aa48

Browse files
committed
Get exit code from google API
Signed-off-by: jorgee <jorge.ejarque@seqera.io>
1 parent 84e3e78 commit ea1aa48

File tree

2 files changed

+107
-8
lines changed

2 files changed

+107
-8
lines changed

plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -545,7 +545,7 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
545545
if( state in COMPLETED ) {
546546
log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - terminated job=$jobId; task=$taskId; state=$state"
547547
// finalize the task
548-
task.exitStatus = readExitFile()
548+
task.exitStatus = getExitCode()
549549
if( state == 'FAILED' ) {
550550
if( task.exitStatus == Integer.MAX_VALUE )
551551
task.error = getJobError()
@@ -565,6 +565,29 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
565565
return false
566566
}
567567

568+
/**
 * Try to get the latest exit code from the task status events list.
 * Fall back to reading the .exitcode file generated by Nextflow when no usable code is found.
 * The rationale is that, in case of error, the exit code returned by the Batch API is more reliable.
 *
 * Only events that carry a task execution record are considered, and the most recent one
 * (by event time, seconds first and then nanos) is used. Codes of 50000 and above are not
 * returned from here; for those the method falls through to the .exitcode file.
 *
 * @return the exit code taken from the task status events when one below 50000 is available,
 *         otherwise whatever {@code readExitFile()} returns
 */
private Integer getExitCode(){
    final events = client.getTaskStatus(jobId, taskId)?.getStatusEventsList()
    if( events ) {
        log.debug("[GOOGLE BATCH] Getting exit code from events: $events")
        final batchExitCode = events.stream()
            .filter(ev -> ev.hasTaskExecution())
            // compare the full timestamp: seconds alone cannot order events
            // that were recorded within the same second
            .max( (ev1, ev2) -> {
                final t1 = ev1.getEventTime()
                final t2 = ev2.getEventTime()
                final bySeconds = Long.compare(t1.seconds, t2.seconds)
                return bySeconds != 0 ? bySeconds : Integer.compare(t1.nanos, t2.nanos)
            } )
            .map(ev -> ev.getTaskExecution().getExitCode())
            .orElse(null)
        if( batchExitCode != null && batchExitCode < 50000) // Ignore 500XX codes, they will be managed later.
            return batchExitCode
    }
    // fallback to read
    log.debug("[GOOGLE BATCH] Exit code not found from API. Checking .exitcode file...")
    return readExitFile()
}
590+
568591
protected Throwable getJobError() {
569592
try {
570593
final events = noTaskJobfailure
@@ -574,7 +597,7 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
574597
log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - last event: ${lastEvent}; exit code: ${lastEvent?.taskExecution?.exitCode}"
575598

576599
final error = lastEvent?.description
577-
if( error && (EXIT_CODE_REGEX.matcher(error).find() || BATCH_ERROR_REGEX.matcher(error).find()) ) {
600+
if( error && (EXIT_CODE_REGEX.matcher(error).find() || BATCH_ERROR_REGEX.matcher(error).find()) || lastEvent?.taskExecution?.exitCode > 50000) {
578601
return new ProcessException(error)
579602
}
580603
}

plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@ import com.google.api.gax.grpc.GrpcStatusCode
2121
import com.google.api.gax.rpc.NotFoundException
2222
import com.google.cloud.batch.v1.JobStatus
2323
import com.google.cloud.batch.v1.Task
24+
import com.google.cloud.batch.v1.TaskExecution
2425
import io.grpc.Status
26+
import nextflow.cloud.google.batch.logging.BatchLogging
27+
import nextflow.exception.ProcessException
2528

2629
import java.nio.file.Path
2730

@@ -473,15 +476,18 @@ class GoogleBatchTaskHandlerTest extends Specification {
473476

474477
}
475478

476-
TaskStatus makeTaskStatus(TaskStatus.State state, String desc) {
479+
TaskStatus makeTaskStatus(TaskStatus.State state, String desc, Integer exitCode = null) {
477480
def builder = TaskStatus.newBuilder()
478481
if (state)
479482
builder.setState(state)
480-
if (desc)
481-
builder.addStatusEvents(
482-
StatusEvent.newBuilder()
483-
.setDescription(desc)
484-
)
483+
if (desc || exitCode != null) {
484+
def statusBuilder = StatusEvent.newBuilder()
485+
if (desc)
486+
statusBuilder.setDescription(desc)
487+
if (exitCode != null)
488+
statusBuilder.setTaskExecution(TaskExecution.newBuilder().setExitCode(exitCode).build())
489+
builder.addStatusEvents(statusBuilder.build())
490+
}
485491
builder.build()
486492
}
487493

@@ -665,4 +671,74 @@ class GoogleBatchTaskHandlerTest extends Specification {
665671
.build()
666672

667673
}
674+
675+
// Data-driven check of checkIfCompleted() when the exit code is carried by the
// task status events returned by the (mocked) Batch client.
// Each row of the where: table is run for both array-child and regular tasks.
def 'should check if completed from task status' () {
676+
given:
677+
def jobId = '1'
678+
def taskId = '1'
679+
// the client stubs return a status built from the row's STATE and EXIT_CODE;
// the description is left empty, so an event is only added when EXIT_CODE is not null
def client = Mock(BatchClient){
680+
getTaskInArrayStatus(jobId, taskId) >> makeTaskStatus(STATE,"", EXIT_CODE)
681+
getTaskStatus(jobId, taskId) >> makeTaskStatus(STATE,"", EXIT_CODE)
682+
getJobStatus(jobId) >> makeJobStatus(JOB_STATUS,"")
683+
}
684+
def logging = Mock(BatchLogging)
685+
def executor = Mock(GoogleBatchExecutor){
686+
getLogging() >> logging
687+
}
688+
def task = new TaskRun()
689+
task.name = 'hello'
690+
// the handler starts in RUNNING state so a transition to COMPLETED can be observed
def handler = Spy(new GoogleBatchTaskHandler(jobId: jobId, taskId: taskId, client: client, task: task, isArrayChild: ARRAY_CHILD, status: nextflow.processor.TaskStatus.RUNNING, executor: executor))
691+
when:
692+
def result = handler.checkIfCompleted()
693+
then:
694+
handler.status == TASK_STATUS
695+
handler.task.exitStatus == EXIT_STATUS
696+
result == RESULT
697+
698+
// RUNNING rows expect no completion and an exit status of Integer.MAX_VALUE
// (presumably the untouched TaskRun default - confirm against TaskRun)
where:
699+
JOB_STATUS | STATE | EXIT_CODE | ARRAY_CHILD | TASK_STATUS | EXIT_STATUS | RESULT
700+
JobStatus.State.SUCCEEDED | TaskStatus.State.SUCCEEDED | 0 | true | nextflow.processor.TaskStatus.COMPLETED | 0 | true
701+
JobStatus.State.FAILED | TaskStatus.State.FAILED | 1 | true | nextflow.processor.TaskStatus.COMPLETED | 1 | true
702+
JobStatus.State.RUNNING | TaskStatus.State.RUNNING | null | true | nextflow.processor.TaskStatus.RUNNING | Integer.MAX_VALUE | false
703+
JobStatus.State.SUCCEEDED | TaskStatus.State.SUCCEEDED | 0 | false | nextflow.processor.TaskStatus.COMPLETED | 0 | true
704+
JobStatus.State.FAILED | TaskStatus.State.FAILED | 1 | false | nextflow.processor.TaskStatus.COMPLETED | 1 | true
705+
JobStatus.State.RUNNING | TaskStatus.State.RUNNING | null | false | nextflow.processor.TaskStatus.RUNNING | Integer.MAX_VALUE | false
706+
707+
}
708+
709+
// Data-driven check of checkIfCompleted() for the cases where the exit status is
// expected to come from readExitFile() instead of the task status events.
// The job status is always FAILED; rows vary the task status, its description and exit code.
def 'should check if completed from read file' () {
710+
given:
711+
def jobId = '1'
712+
def taskId = '1'
713+
// closures are used so that a null TASK_STATE yields a null task status from the client
def client = Mock(BatchClient){
714+
getTaskInArrayStatus(jobId, taskId) >> { TASK_STATE ? makeTaskStatus(TASK_STATE, DESC, EXIT_CODE): null }
715+
getTaskStatus(jobId, taskId) >> { TASK_STATE ? makeTaskStatus(TASK_STATE, DESC, EXIT_CODE): null }
716+
getJobStatus(jobId ) >> makeJobStatus(JobStatus.State.FAILED,DESC)
717+
}
718+
def logging = Mock(BatchLogging)
719+
def executor = Mock(GoogleBatchExecutor){
720+
getLogging() >> logging
721+
}
722+
def task = new TaskRun()
723+
task.name = 'hello'
724+
// a Spy is needed because readExitFile() is stubbed and counted in the then: block
def handler = Spy(new GoogleBatchTaskHandler(jobId: jobId, taskId: taskId, client: client, task: task, isArrayChild: ARRAY_CHILD, status: nextflow.processor.TaskStatus.RUNNING, executor: executor))
725+
when:
726+
def result = handler.checkIfCompleted()
727+
then:
728+
// readExitFile() must be called exactly once; its stubbed value is the expected exit status
1 * handler.readExitFile() >> EXIT_STATUS
729+
handler.status == TASK_STATUS
730+
handler.task.exitStatus == EXIT_STATUS
731+
handler.task.error?.message == TASK_ERROR
732+
result == RESULT
733+
734+
// rows: event without an exit code, no task status at all, and exit code 50001
// (only the 50001 rows expect the event description to be reported as the task error)
where:
735+
TASK_STATE | DESC | EXIT_CODE | ARRAY_CHILD | TASK_STATUS | EXIT_STATUS | RESULT | TASK_ERROR
736+
TaskStatus.State.FAILED | "Error" | null | true | nextflow.processor.TaskStatus.COMPLETED | 0 | true | null
737+
null | "Error" | null | true | nextflow.processor.TaskStatus.COMPLETED | 1 | true | null
738+
TaskStatus.State.FAILED | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | true | nextflow.processor.TaskStatus.COMPLETED | Integer.MAX_VALUE | true | 'Task failed due to Spot VM preemption with exit code 50001.'
739+
TaskStatus.State.FAILED | "Error" | null | false | nextflow.processor.TaskStatus.COMPLETED | 0 | true | null
740+
null | "Error" | null | false | nextflow.processor.TaskStatus.COMPLETED | 1 | true | null
741+
TaskStatus.State.FAILED | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | false | nextflow.processor.TaskStatus.COMPLETED | Integer.MAX_VALUE | true | 'Task failed due to Spot VM preemption with exit code 50001.'
742+
}
743+
668744
}

0 commit comments

Comments
 (0)