Add examples for three existing failure policy actions. (#601)

* Add examples for three existing failure policy actions. Add examples for each of the following failure policy actions: 1. FailJobSet, 2. RestartJobSet, 3. RestartJobSetAndIgnoreMaxRestarts. * Add example for configurable failure policy using a rule with onJobFailureReasons present. * Correct the name of the jobset in 'examples/failure-policy/onjobfailurereasons-present.yaml'. * Add example using onJobFailureReasons with the selected reason being PodFailurePolicy. * Add example similar to a host maintenance event. * Add short descriptions of expected behavior in examples. * Fix grammatical error. * Add comment describing host maintenance example.
kubernetes-sigs · Sep 20, 2024 · 665bc42 · 665bc42
1 parent ef7f910
commit 665bc42
Show file tree

Hide file tree

Showing 6 changed files with 397 additions and 0 deletions.
diff --git a/examples/failure-policy/failjobset-action.yaml b/examples/failure-policy/failjobset-action.yaml
@@ -0,0 +1,61 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: failjobset-action-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+    # Fail the whole JobSet immediately when the leader job fails.
+    - action: FailJobSet
+      targetReplicatedJobs:
+      - leader
+  replicatedJobs:
+  - name: leader
+    replicas: 1
+    template:
+      spec:
+        # Set backoff limit to 0 so the job fails as soon as any pod fails.
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            containers:
+            - name: leader
+              image: bash:latest
+              command:  # JOB_COMPLETION_INDEX 0 exits 1 after a 10s countdown; others count to 1000.
+              - bash
+              - -xc
+              - |
+                echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                  for i in $(seq 10 -1 1)
+                  do
+                    echo "Sleeping in $i"
+                    sleep 1
+                  done
+                  exit 1
+                fi
+                for i in $(seq 1 1000)
+                do
+                  echo "$i"
+                  sleep 1
+                done
+  - name: workers
+    replicas: 1
+    template:
+      spec:
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            containers:
+            - name: worker
+              image: bash:latest
+              command:  # Idle for 1000 seconds.
+              - bash
+              - -xc
+              - |
+                sleep 1000
diff --git a/examples/failure-policy/host-maintenance-event-model.yaml b/examples/failure-policy/host-maintenance-event-model.yaml
@@ -0,0 +1,74 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: host-maintenance-event-model
+spec:
+  failurePolicy:
+    maxRestarts: 0
+    rules:
+    # The JobSet will restart an unlimited number of times when the failure matches the pod failure policy.
+    - action: RestartJobSetAndIgnoreMaxRestarts
+      onJobFailureReasons:
+      - PodFailurePolicy
+    # The JobSet is restarted as normal when the leader job fails and the above rule is not matched.
+    - action: RestartJobSet
+      targetReplicatedJobs:
+      - leader
+  replicatedJobs:
+  - name: leader
+    replicas: 1
+    template:
+      spec:
+        # Set backoff limit to 0 so the job fails as soon as any pod fails.
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            restartPolicy: Never
+            containers:
+            - name: leader
+              image: bash:latest
+              command:  # JOB_COMPLETION_INDEX 0 exits 1 after a 120s countdown; others count to 1000.
+              - bash
+              - -xc
+              - |
+                echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                  for i in $(seq 120 -1 1)
+                  do
+                    echo "Sleeping in $i"
+                    sleep 1
+                  done
+                  exit 1
+                fi
+                for i in $(seq 1 1000)
+                do
+                  echo "$i"
+                  sleep 1
+                done
+        # This pod failure policy is triggered when a node undergoes host maintenance.
+        # In such a case, the pods are evicted and receive a condition of type
+        # DisruptionTarget, which makes the rule below fail the job.
+        podFailurePolicy:
+          rules:
+          - action: FailJob
+            onPodConditions:
+            - type: DisruptionTarget
+  - name: workers
+    replicas: 1
+    template:
+      spec:
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            containers:
+            - name: worker
+              image: bash:latest
+              command:  # Idle for 1000 seconds.
+              - bash
+              - -xc
+              - |
+                sleep 1000
diff --git a/examples/failure-policy/onjobfailurereasons-present-podfailurepolicy.yaml b/examples/failure-policy/onjobfailurereasons-present-podfailurepolicy.yaml
@@ -0,0 +1,74 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: onjobfailurereasons-podfailurepolicy-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+    # The JobSet will restart an unlimited number of times
+    # when the leader job fails with a failure reason matching
+    # the pod failure policy.
+    - action: RestartJobSetAndIgnoreMaxRestarts
+      targetReplicatedJobs:
+      - leader
+      onJobFailureReasons:
+      - PodFailurePolicy
+  replicatedJobs:
+  - name: leader
+    replicas: 1
+    template:
+      spec:
+        # Set backoff limit to 0 so the job fails as soon as any pod fails.
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            restartPolicy: Never
+            containers:
+            - name: leader
+              image: bash:latest
+              command:  # JOB_COMPLETION_INDEX 0 exits 1 after a 10s countdown; others count to 1000.
+              - bash
+              - -xc
+              - |
+                echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                  for i in $(seq 10 -1 1)
+                  do
+                    echo "Sleeping in $i"
+                    sleep 1
+                  done
+                  exit 1
+                fi
+                for i in $(seq 1 1000)
+                do
+                  echo "$i"
+                  sleep 1
+                done
+        podFailurePolicy:
+          rules:
+          - action: FailJob
+            onPodConditions: []
+            onExitCodes:
+              containerName: leader
+              operator: In
+              values: [1]  # Matches the "exit 1" in the leader script above.
+  - name: workers
+    replicas: 1
+    template:
+      spec:
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            containers:
+            - name: worker
+              image: bash:latest
+              command:  # Idle for 1000 seconds.
+              - bash
+              - -xc
+              - |
+                sleep 1000
diff --git a/examples/failure-policy/onjobfailurereasons-present.yaml b/examples/failure-policy/onjobfailurereasons-present.yaml
@@ -0,0 +1,64 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: onjobfailurereasons-present-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+    # The JobSet will restart an unlimited number of times when the
+    # leader job fails with the failure reason BackoffLimitExceeded.
+    - action: RestartJobSetAndIgnoreMaxRestarts
+      targetReplicatedJobs:
+      - leader
+      onJobFailureReasons:
+      - BackoffLimitExceeded
+  replicatedJobs:
+  - name: leader
+    replicas: 1
+    template:
+      spec:
+        # Set backoff limit to 0 so the job fails as soon as any pod fails.
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            containers:
+            - name: leader
+              image: bash:latest
+              command:  # JOB_COMPLETION_INDEX 0 exits 1 after a 10s countdown; others count to 1000.
+              - bash
+              - -xc
+              - |
+                echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                  for i in $(seq 10 -1 1)
+                  do
+                    echo "Sleeping in $i"
+                    sleep 1
+                  done
+                  exit 1
+                fi
+                for i in $(seq 1 1000)
+                do
+                  echo "$i"
+                  sleep 1
+                done
+  - name: workers
+    replicas: 1
+    template:
+      spec:
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            containers:
+            - name: worker
+              image: bash:latest
+              command:  # Idle for 1000 seconds.
+              - bash
+              - -xc
+              - |
+                sleep 1000
diff --git a/examples/failure-policy/restartjobset-action.yaml b/examples/failure-policy/restartjobset-action.yaml
@@ -0,0 +1,61 @@
+apiVersion: jobset.x-k8s.io/v1alpha2
+kind: JobSet
+metadata:
+  name: restartjobset-action-example
+spec:
+  failurePolicy:
+    maxRestarts: 3
+    rules:
+    # Restart the JobSet when the leader job fails.
+    - action: RestartJobSet
+      targetReplicatedJobs:
+      - leader
+  replicatedJobs:
+  - name: leader
+    replicas: 1
+    template:
+      spec:
+        # Set backoff limit to 0 so the job fails as soon as any pod fails.
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            containers:
+            - name: leader
+              image: bash:latest
+              command:  # JOB_COMPLETION_INDEX 0 exits 1 after a 10s countdown; others count to 1000.
+              - bash
+              - -xc
+              - |
+                echo "JOB_COMPLETION_INDEX=$JOB_COMPLETION_INDEX"
+                if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
+                  for i in $(seq 10 -1 1)
+                  do
+                    echo "Sleeping in $i"
+                    sleep 1
+                  done
+                  exit 1
+                fi
+                for i in $(seq 1 1000)
+                do
+                  echo "$i"
+                  sleep 1
+                done
+  - name: workers
+    replicas: 1
+    template:
+      spec:
+        backoffLimit: 0
+        completions: 2
+        parallelism: 2
+        template:
+          spec:
+            containers:
+            - name: worker
+              image: bash:latest
+              command:  # Idle for 1000 seconds.
+              - bash
+              - -xc
+              - |
+                sleep 1000