Skip to content

Commit

Permalink
Add examples for three existing failure policy actions. (#601)
Browse files Browse the repository at this point in the history
* Add examples for three existing failure policy actions.

Add examples for each of the following failure policy actions:
1. FailJobSet,
2. RestartJobSet,
3. RestartJobSetAndIgnoreMaxRestarts.

* Add example for configurable failure policy using a rule with onJobFailureReasons present.

* Correct the name of the jobset in 'examples/failure-policy/onjobfailurereasons-present.yaml'.

* Add example using onJobFailureReasons with the selected reason being PodFailurePolicy.

* Add example similar to a host maintenance event.

* Add short descriptions of expected behavior in examples.

* Fix grammatical error.

* Add comment describing host maintenance example.
  • Loading branch information
jedwins1998 authored Sep 20, 2024
1 parent ef7f910 commit 665bc42
Show file tree
Hide file tree
Showing 6 changed files with 397 additions and 0 deletions.
61 changes: 61 additions & 0 deletions examples/failure-policy/failjobset-action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: failjobset-action-example
spec:
  failurePolicy:
    maxRestarts: 3
    rules:
      # A failure of the leader job fails the whole JobSet immediately.
      - action: FailJobSet
        targetReplicatedJobs:
          - leader
  replicatedJobs:
    - name: leader
      replicas: 1
      template:
        spec:
          # backoffLimit 0: a single pod failure fails the job right away.
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              containers:
                - name: leader
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # The pod whose JOB_COMPLETION_INDEX is 0 counts down for
                    # 10 seconds and exits 1; any other index prints a counter
                    # once per second.
                    - |
                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
                      for i in $(seq 10 -1 1)
                      do
                      echo "Sleeping in $i"
                      sleep 1
                      done
                      exit 1
                      fi
                      for i in $(seq 1 1000)
                      do
                      echo "$i"
                      sleep 1
                      done
    - name: workers
      replicas: 1
      template:
        spec:
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              containers:
                - name: worker
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # Workers only sleep; they never fail on their own.
                    - |
                      sleep 1000
74 changes: 74 additions & 0 deletions examples/failure-policy/host-maintenance-event-model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: host-maintenance-event-model
spec:
  failurePolicy:
    maxRestarts: 0
    rules:
      # Job failures whose reason is PodFailurePolicy restart the JobSet
      # without counting against maxRestarts, so these restarts are unlimited.
      - action: RestartJobSetAndIgnoreMaxRestarts
        onJobFailureReasons:
          - PodFailurePolicy
      # Any other leader job failure (the rule above did not match) goes
      # through the regular RestartJobSet action.
      # NOTE(review): with maxRestarts set to 0 above, a restart that counts
      # against the limit presumably fails the JobSet right away - confirm
      # against the JobSet failure policy documentation.
      - action: RestartJobSet
        targetReplicatedJobs:
          - leader
  replicatedJobs:
    - name: leader
      replicas: 1
      template:
        spec:
          # backoffLimit 0: a single pod failure fails the job right away.
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              restartPolicy: Never
              containers:
                - name: leader
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # The pod whose JOB_COMPLETION_INDEX is 0 counts down for
                    # 120 seconds and exits 1; any other index prints a
                    # counter once per second.
                    - |
                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
                      for i in $(seq 120 -1 1)
                      do
                      echo "Sleeping in $i"
                      sleep 1
                      done
                      exit 1
                      fi
                      for i in $(seq 1 1000)
                      do
                      echo "$i"
                      sleep 1
                      done
          # Models host maintenance on a node: the pods are evicted, and this
          # rule fails the job when a pod carries a condition of type
          # DisruptionTarget.
          podFailurePolicy:
            rules:
              - action: FailJob
                onPodConditions:
                  - type: DisruptionTarget
    - name: workers
      replicas: 1
      template:
        spec:
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              containers:
                - name: worker
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # Workers only sleep; they never fail on their own.
                    - |
                      sleep 1000
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: onjobfailurereasons-podfailurepolicy-example
spec:
  failurePolicy:
    maxRestarts: 3
    rules:
      # When the leader job fails with the reason PodFailurePolicy, the
      # JobSet restarts without counting against maxRestarts, i.e. an
      # unlimited number of times.
      - action: RestartJobSetAndIgnoreMaxRestarts
        targetReplicatedJobs:
          - leader
        onJobFailureReasons:
          - PodFailurePolicy
  replicatedJobs:
    - name: leader
      replicas: 1
      template:
        spec:
          # backoffLimit 0: a single pod failure fails the job right away.
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              restartPolicy: Never
              containers:
                - name: leader
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # The pod whose JOB_COMPLETION_INDEX is 0 counts down for
                    # 10 seconds and exits 1; any other index prints a counter
                    # once per second.
                    - |
                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
                      for i in $(seq 10 -1 1)
                      do
                      echo "Sleeping in $i"
                      sleep 1
                      done
                      exit 1
                      fi
                      for i in $(seq 1 1000)
                      do
                      echo "$i"
                      sleep 1
                      done
          # Fail the job through the pod failure policy when the leader
          # container exits with code 1 (the exit code the script above uses).
          podFailurePolicy:
            rules:
              - action: FailJob
                onPodConditions: []
                onExitCodes:
                  containerName: leader
                  operator: In
                  values:
                    - 1
    - name: workers
      replicas: 1
      template:
        spec:
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              containers:
                - name: worker
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # Workers only sleep; they never fail on their own.
                    - |
                      sleep 1000
64 changes: 64 additions & 0 deletions examples/failure-policy/onjobfailurereasons-present.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: onjobfailurereasons-present-example
spec:
  failurePolicy:
    maxRestarts: 3
    rules:
      # When the leader job fails with the reason BackoffLimitExceeded, the
      # JobSet restarts without counting against maxRestarts, i.e. an
      # unlimited number of times.
      - action: RestartJobSetAndIgnoreMaxRestarts
        targetReplicatedJobs:
          - leader
        onJobFailureReasons:
          - BackoffLimitExceeded
  replicatedJobs:
    - name: leader
      replicas: 1
      template:
        spec:
          # backoffLimit 0: a single pod failure fails the job right away.
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              containers:
                - name: leader
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # The pod whose JOB_COMPLETION_INDEX is 0 counts down for
                    # 10 seconds and exits 1; any other index prints a counter
                    # once per second.
                    - |
                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
                      for i in $(seq 10 -1 1)
                      do
                      echo "Sleeping in $i"
                      sleep 1
                      done
                      exit 1
                      fi
                      for i in $(seq 1 1000)
                      do
                      echo "$i"
                      sleep 1
                      done
    - name: workers
      replicas: 1
      template:
        spec:
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              containers:
                - name: worker
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # Workers only sleep; they never fail on their own.
                    - |
                      sleep 1000
61 changes: 61 additions & 0 deletions examples/failure-policy/restartjobset-action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: restartjobset-action-example
spec:
  failurePolicy:
    maxRestarts: 3
    rules:
      # A failure of the leader job restarts the JobSet.
      - action: RestartJobSet
        targetReplicatedJobs:
          - leader
  replicatedJobs:
    - name: leader
      replicas: 1
      template:
        spec:
          # backoffLimit 0: a single pod failure fails the job right away.
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              containers:
                - name: leader
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # The pod whose JOB_COMPLETION_INDEX is 0 counts down for
                    # 10 seconds and exits 1; any other index prints a counter
                    # once per second.
                    - |
                      echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
                      if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
                      for i in $(seq 10 -1 1)
                      do
                      echo "Sleeping in $i"
                      sleep 1
                      done
                      exit 1
                      fi
                      for i in $(seq 1 1000)
                      do
                      echo "$i"
                      sleep 1
                      done
    - name: workers
      replicas: 1
      template:
        spec:
          backoffLimit: 0
          completions: 2
          parallelism: 2
          template:
            spec:
              containers:
                - name: worker
                  image: bash:latest
                  command:
                    - bash
                    - -xc
                    # Workers only sleep; they never fail on their own.
                    - |
                      sleep 1000
Loading

0 comments on commit 665bc42

Please sign in to comment.