Skip to content

Commit 70f30c6

Browse files
jorgee and pditommaso committed
Get exit code from pod to manage OOM in k8s (#6442)
Signed-off-by: jorgee <jorge.ejarque@seqera.io>
Signed-off-by: Paolo Di Tommaso <paolo.ditommaso@gmail.com>
Co-authored-by: Paolo Di Tommaso <paolo.ditommaso@gmail.com>
1 parent 0ccdee9 commit 70f30c6

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

plugins/nf-k8s/src/main/nextflow/k8s/K8sTaskHandler.groovy

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -423,7 +423,15 @@ class K8sTaskHandler extends TaskHandler implements FusionAwareTask {
423423
}
424424
else {
425425
// finalize the task
426-
task.exitStatus = readExitFile()
426+
// read the exit code from the K8s container terminated state; if it is 0 (successful) or missing,
427+
// take the exit code from the `.exitcode` file created by nextflow
428+
// the rationale is that in case of error (e.g. OOMKilled, pod eviction), the exit code from
429+
// the K8s API is more reliable because the container may terminate before the exit file is written
430+
// See https://github.com/nextflow-io/nextflow/issues/6436
431+
// https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.30/#containerstateterminated-v1-core
432+
log.trace("[k8s] Container Terminated state ${state.terminated}")
433+
final k8sExitCode = (state.terminated as Map)?.exitCode as Integer
434+
task.exitStatus = k8sExitCode ?: readExitFile()
427435
task.stdout = outputFile
428436
task.stderr = errorFile
429437
}

plugins/nf-k8s/src/test/nextflow/k8s/K8sTaskHandlerTest.groovy

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -509,6 +509,32 @@ class K8sTaskHandlerTest extends Specification {
509509

510510
}
511511

512+
def 'should use K8s exit code when available' () {
513+
given:
514+
def ERR_FILE = Paths.get('err.file')
515+
def OUT_FILE = Paths.get('out.file')
516+
def POD_NAME = 'pod-xyz'
517+
def client = Mock(K8sClient)
518+
def termState = [ reason: "Error",
519+
startedAt: "2018-01-13T10:09:36Z",
520+
finishedAt: "2018-01-13T10:19:36Z",
521+
exitCode: 137 ]
522+
def task = new TaskRun()
523+
def handler = Spy(new K8sTaskHandler(task: task, client:client, podName: POD_NAME, outputFile: OUT_FILE, errorFile: ERR_FILE))
524+
525+
when:
526+
def result = handler.checkIfCompleted()
527+
then:
528+
1 * handler.getState() >> [terminated: termState]
529+
1 * handler.updateTimestamps(termState)
530+
0 * handler.readExitFile()
531+
1 * handler.deletePodIfSuccessful(task) >> null
532+
1 * handler.savePodLogOnError(task) >> null
533+
handler.task.exitStatus == 137
534+
handler.status == TaskStatus.COMPLETED
535+
result == true
536+
}
537+
512538
def 'should kill a pod' () {
513539
given:
514540
def POD_NAME = 'pod-xyz'

0 commit comments

Comments
 (0)