Fix issue where a DAG with exhausted retries would get stuck Running #1364

Merged 1 commit on May 11, 2019
40 changes: 40 additions & 0 deletions test/e2e/expectedfailures/dag-exhausted-retries.yaml
@@ -0,0 +1,40 @@
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: dag-exhausted-retries-
spec:
  entrypoint: retry-with-dags
  templates:
  - name: retry-with-dags
    dag:
      tasks:
      - name: success1
        template: success
      - name: sub-dag1
        template: sub-dag
        dependencies:
        - success1
      - name: success2
        dependencies:
        - sub-dag1
        template: success

  - name: sub-dag
    dag:
      tasks:
      - name: fail
        template: fail

  - name: fail
    container:
      command: [sh, -c, exit 1]
      image: alpine
    retryStrategy:
      limit: 1

  - name: success
    container:
      command: [sh, -c, exit 0]
      image: alpine
    retryStrategy:
      limit: 1
4 changes: 2 additions & 2 deletions test/e2e/functional/dag-with-retries.yaml
@@ -16,8 +16,8 @@ spec:
     outputs: {}
   - name: randomly-fail
     retryStrategy:
-      limit: 10
+      limit: 20
     container:
       image: alpine:latest
       command: [sh, -c]
-      args: ["exit $(( ${RANDOM} % 3 ))"]
+      args: ["exit $(( ${RANDOM} % 5 ))"]
29 changes: 22 additions & 7 deletions workflow/controller/dag.go
@@ -75,15 +75,16 @@ func (d *dagContext) assessDAGPhase(targetTasks []string, nodes map[string]wfv1.NodeStatus) wfv1.NodePhase {
 		if !node.Completed() {
 			return wfv1.NodeRunning
 		}
-		if !node.Successful() && unsuccessfulPhase == "" {
+		if node.Successful() {
+			continue
+		}
+		// failed retry attempts should not factor into the overall unsuccessful phase of the dag
+		// because the subsequent attempt may have succeeded
+		if unsuccessfulPhase == "" && !isRetryAttempt(node, nodes) {
 			unsuccessfulPhase = node.Phase
 		}
-		if node.Type == wfv1.NodeTypeRetry {
-			if node.Successful() {
-				retriesExhausted = false
-			} else if hasMoreRetries(&node, d.wf) {
-				retriesExhausted = false
-			}
+		if node.Type == wfv1.NodeTypeRetry && hasMoreRetries(&node, d.wf) {
+			retriesExhausted = false
 		}
 	}
 	if unsuccessfulPhase != "" {
@@ -107,6 +108,20 @@ func (d *dagContext) assessDAGPhase(targetTasks []string, nodes map[string]wfv1.NodeStatus) wfv1.NodePhase {
 	return wfv1.NodeSucceeded
 }
 
+// isRetryAttempt detects if a node is part of a retry
+func isRetryAttempt(node wfv1.NodeStatus, nodes map[string]wfv1.NodeStatus) bool {
+	for _, potentialParent := range nodes {
+		if potentialParent.Type == wfv1.NodeTypeRetry {
+			for _, child := range potentialParent.Children {
+				if child == node.ID {
+					return true
+				}
+			}
+		}
+	}
+	return false
+}
+
 func hasMoreRetries(node *wfv1.NodeStatus, wf *wfv1.Workflow) bool {
 	if node.Phase == wfv1.NodeSucceeded {
 		return false
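To make the intent of the change easier to see outside the controller, here is a minimal, self-contained sketch of the idea. The `NodeStatus` struct, the string phases, and the node IDs below are simplified stand-ins, not the real `wfv1` API: a node that is the child of a Retry node is an individual attempt, so its failure is ignored when choosing the DAG's unsuccessful phase; only the Retry group node itself, once it has no retries left, should drive the DAG to Failed instead of leaving it stuck Running.

```go
package main

import "fmt"

// NodeStatus is a simplified stand-in for the controller's node record.
type NodeStatus struct {
	ID       string
	Type     string // "Pod", "Retry", "DAG"
	Phase    string // "Succeeded", "Failed", "Running"
	Children []string
}

// isRetryAttempt reports whether node is a child of some Retry node,
// i.e. an individual attempt rather than the retry group itself.
func isRetryAttempt(node NodeStatus, nodes map[string]NodeStatus) bool {
	for _, parent := range nodes {
		if parent.Type != "Retry" {
			continue
		}
		for _, child := range parent.Children {
			if child == node.ID {
				return true
			}
		}
	}
	return false
}

func main() {
	// A Retry group whose two attempts both failed (retries exhausted).
	nodes := map[string]NodeStatus{
		"fail":    {ID: "fail", Type: "Retry", Phase: "Failed", Children: []string{"fail(0)", "fail(1)"}},
		"fail(0)": {ID: "fail(0)", Type: "Pod", Phase: "Failed"},
		"fail(1)": {ID: "fail(1)", Type: "Pod", Phase: "Failed"},
	}

	// Only the Retry group node ("fail") should factor into the DAG's
	// phase; its exhausted attempts are skipped.
	for _, id := range []string{"fail", "fail(0)", "fail(1)"} {
		fmt.Printf("%-8s retryAttempt=%v\n", id, isRetryAttempt(nodes[id], nodes))
	}
}
```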
8 changes: 8 additions & 0 deletions workflow/controller/dag_test.go
@@ -24,3 +24,11 @@ func TestDagRetrySucceeded(t *testing.T) {
 	woc.operate()
 	assert.Equal(t, string(wfv1.NodeSucceeded), string(woc.wf.Status.Phase))
 }
+
+// TestDagRetryExhaustedXfail verifies we fail properly when we exhaust our retries
+func TestDagRetryExhaustedXfail(t *testing.T) {
+	wf := test.LoadTestWorkflow("testdata/dag-exhausted-retries-xfail.yaml")
+	woc := newWoc(*wf)
+	woc.operate()
+	assert.Equal(t, string(wfv1.NodeFailed), string(woc.wf.Status.Phase))
+}
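The new case can be run on its own with `go test ./workflow/controller -run TestDagRetryExhaustedXfail`, assuming the repository's standard Go test setup from the repo root; it replays the captured workflow below and asserts the DAG ends up Failed rather than Running.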
159 changes: 159 additions & 0 deletions workflow/controller/testdata/dag-exhausted-retries-xfail.yaml
@@ -0,0 +1,159 @@
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  creationTimestamp: "2019-05-10T01:04:45Z"
  generateName: retry-with-dags-
  generation: 12
  labels:
    workflows.argoproj.io/phase: Running
  name: retry-with-dags-52zdr
  namespace: default
  resourceVersion: "1171"
  selfLink: /apis/argoproj.io/v1alpha1/namespaces/default/workflows/retry-with-dags-52zdr
  uid: 99aeb0ac-72bf-11e9-823d-d6003286b4aa
spec:
  arguments: {}
  entrypoint: retry-with-dags
  templates:
  - dag:
      tasks:
      - arguments: {}
        name: success1
        template: success
      - arguments: {}
        dependencies:
        - success1
        name: sub-dag1
        template: sub-dag
      - arguments: {}
        dependencies:
        - sub-dag1
        name: success2
        template: success
    inputs: {}
    metadata: {}
    name: retry-with-dags
    outputs: {}
  - dag:
      tasks:
      - arguments: {}
        name: fail
        template: fail
    inputs: {}
    metadata: {}
    name: sub-dag
    outputs: {}
  - container:
      command:
      - sh
      - -c
      - exit 1
      image: alpine
      name: ""
      resources: {}
    inputs: {}
    metadata: {}
    name: fail
    outputs: {}
    retryStrategy:
      limit: 1
  - container:
      command:
      - sh
      - -c
      - exit 0
      image: alpine
      name: ""
      resources: {}
    inputs: {}
    metadata: {}
    name: success
    outputs: {}
    retryStrategy:
      limit: 1
status:
  finishedAt: null
  nodes:
    retry-with-dags-52zdr:
      children:
      - retry-with-dags-52zdr-1651444810
      displayName: retry-with-dags-52zdr
      finishedAt: null
      id: retry-with-dags-52zdr
      name: retry-with-dags-52zdr
      phase: Running
      startedAt: "2019-05-10T01:05:11Z"
      templateName: retry-with-dags
      type: DAG
    retry-with-dags-52zdr-139017483:
      boundaryID: retry-with-dags-52zdr-1777128546
      displayName: fail(0)
      finishedAt: "2019-05-10T01:05:18Z"
      id: retry-with-dags-52zdr-139017483
      message: failed with exit code 1
      name: retry-with-dags-52zdr.sub-dag1.fail(0)
      phase: Failed
      startedAt: "2019-05-10T01:05:16Z"
      templateName: fail
      type: Pod
    retry-with-dags-52zdr-743158862:
      boundaryID: retry-with-dags-52zdr-1777128546
      displayName: fail(1)
      finishedAt: "2019-05-10T01:05:23Z"
      id: retry-with-dags-52zdr-743158862
      message: failed with exit code 1
      name: retry-with-dags-52zdr.sub-dag1.fail(1)
      phase: Failed
      startedAt: "2019-05-10T01:05:20Z"
      templateName: fail
      type: Pod
    retry-with-dags-52zdr-1617789624:
      boundaryID: retry-with-dags-52zdr-1777128546
      children:
      - retry-with-dags-52zdr-139017483
      - retry-with-dags-52zdr-743158862
      displayName: fail
      finishedAt: "2019-05-10T01:05:24Z"
      id: retry-with-dags-52zdr-1617789624
      message: No more retries left
      name: retry-with-dags-52zdr.sub-dag1.fail
      phase: Failed
      startedAt: "2019-05-10T01:05:16Z"
      type: Retry
    retry-with-dags-52zdr-1651444810:
      boundaryID: retry-with-dags-52zdr
      children:
      - retry-with-dags-52zdr-2712951537
      displayName: success1
      finishedAt: "2019-05-10T01:05:15Z"
      id: retry-with-dags-52zdr-1651444810
      name: retry-with-dags-52zdr.success1
      phase: Succeeded
      startedAt: "2019-05-10T01:05:11Z"
      type: Retry
    retry-with-dags-52zdr-1777128546:
      boundaryID: retry-with-dags-52zdr
      children:
      - retry-with-dags-52zdr-1617789624
      displayName: sub-dag1
      finishedAt: "2019-05-10T01:05:24Z"
      id: retry-with-dags-52zdr-1777128546
      name: retry-with-dags-52zdr.sub-dag1
      phase: Failed
      startedAt: "2019-05-10T01:05:16Z"
      templateName: sub-dag
      type: DAG
    retry-with-dags-52zdr-2712951537:
      boundaryID: retry-with-dags-52zdr
      children:
      - retry-with-dags-52zdr-1777128546
      displayName: success1(0)
      finishedAt: "2019-05-10T01:05:13Z"
      id: retry-with-dags-52zdr-2712951537
      name: retry-with-dags-52zdr.success1(0)
      phase: Succeeded
      startedAt: "2019-05-10T01:05:11Z"
      templateName: success
      type: Pod
  phase: Running
  startedAt: "2019-05-10T01:05:11Z"