Add get/retry experiment commands. Support experiment retries

argoproj · Nov 5, 2019 · 782337f · 782337f
1 parent 353c60e
commit 782337f
Show file tree

Hide file tree

Showing 25 changed files with 1,087 additions and 504 deletions.
diff --git a/examples/analysis-templates.yaml b/examples/analysis-templates.yaml
@@ -0,0 +1,52 @@
+# This file contains AnalysisTemplates referenced by Rollouts and Experiments in the examples.
+# Please apply this file first, before running any of the examples.
+
+---
+# This AnalysisTemplate runs a Kubernetes Job every 5 seconds; the Job always succeeds.
+kind: AnalysisTemplate
+apiVersion: argoproj.io/v1alpha1
+metadata:
+  name: pass
+spec:
+  metrics:
+  - name: pass
+    interval: 5
+    maxFailures: 1
+    provider:
+      job:
+        spec:
+          template:
+            spec:
+              containers:
+              - name: sleep
+                image: alpine:3.8
+                command: [sh, -c]
+                args: [exit 0]
+              restartPolicy: Never
+          backoffLimit: 0
+
+---
+# This AnalysisTemplate will run a Kubernetes Job every 5 seconds, with a 50% chance of failure.
+# When the number of accumulated failures exceeds maxFailures, it will cause the analysis run to
+# fail, and subsequently cause the rollout or experiment to abort.
+kind: AnalysisTemplate
+apiVersion: argoproj.io/v1alpha1
+metadata:
+  name: random-fail
+spec:
+  metrics:
+  - name: random-fail
+    interval: 5
+    maxFailures: 1
+    provider:
+      job:
+        spec:
+          template:
+            spec:
+              containers:
+              - name: sleep
+                image: alpine:3.8
+                command: [sh, -c]
+                args: [FLIP=$(($(($RANDOM%10))%2)) && exit $FLIP]
+              restartPolicy: Never
+          backoffLimit: 0
diff --git a/examples/experiment-with-analysis.yaml b/examples/experiment-with-analysis.yaml
@@ -0,0 +1,43 @@
+# This example demonstrates an experiment which starts two ReplicaSets with different images, and
+# additionally starts an AnalysisRun in the background
+#
+# Prerequisites:
+# * kubectl apply -f analysis-templates.yaml
+#
+apiVersion: argoproj.io/v1alpha1
+kind: Experiment
+metadata:
+  name: experiment-with-analysis
+spec:
+  templates:
+  - name: purple
+    selector:
+      matchLabels:
+        app: rollouts-demo
+    template:
+      metadata:
+        labels:
+          app: rollouts-demo
+      spec:
+        containers:
+        - name: rollouts-demo
+          image: argoproj/rollouts-demo:purple
+          imagePullPolicy: Always
+  - name: orange
+    selector:
+      matchLabels:
+        app: rollouts-demo
+    template:
+      metadata:
+        labels:
+          app: rollouts-demo
+      spec:
+        containers:
+        - name: rollouts-demo
+          image: argoproj/rollouts-demo:orange
+          imagePullPolicy: Always
+  analyses:
+  - name: random-fail
+    templateName: random-fail
+  - name: pass
+    templateName: pass
diff --git a/examples/rollout-analysis-step.yaml b/examples/rollout-analysis-step.yaml
@@ -1,4 +1,8 @@
 # This example demonstrates a Rollout which starts and finishes analysis at a specific canary step
+#
+# Prerequisites:
+# * kubectl apply -f analysis-templates.yaml
+#
 apiVersion: argoproj.io/v1alpha1
 kind: Rollout
 metadata:
@@ -31,29 +35,3 @@ spec:
       - analysis:
           name: random-fail
           templateName: random-fail
-
----
-# This AnalysisTemplate will run a Kubernetes Job every 5 seconds, with a 50% chance of failure.
-# When the number of accumulated failures exceeds maxFailures, it will cause the analysis run to
-# fail, and subsequently cause the rollout to abort.
-kind: AnalysisTemplate
-apiVersion: argoproj.io/v1alpha1
-metadata:
-  name: random-fail
-spec:
-  metrics:
-  - name: random-fail
-    interval: 5
-    maxFailures: 1
-    provider:
-      job:
-        spec:
-          template:
-            spec:
-              containers:
-              - name: sleep
-                image: alpine:3.8
-                command: [sh, -c]
-                args: [FLIP=$(($(($RANDOM%10))%2)) && exit $FLIP]
-              restartPolicy: Never
-          backoffLimit: 0
diff --git a/examples/rollout-background-analysis.yaml b/examples/rollout-background-analysis.yaml
@@ -1,4 +1,8 @@
 # This example demonstrates a Rollout which performs background analysis while the Rollout is updating.
+#
+# Prerequisites:
+# * kubectl apply -f analysis-templates.yaml
+#
 apiVersion: argoproj.io/v1alpha1
 kind: Rollout
 metadata:
@@ -31,29 +35,3 @@ spec:
       steps:
       - setWeight: 25
       - pause: {}
-
----
-# This AnalysisTemplate will run a Kubernetes Job every 5 seconds, with a 50% chance of failure.
-# When the number of accumulated failures exceeds maxFailures, it will cause the analysis run to
-# fail, and subsequently cause the rollout to abort.
-kind: AnalysisTemplate
-apiVersion: argoproj.io/v1alpha1
-metadata:
-  name: random-fail
-spec:
-  metrics:
-  - name: random-fail
-    interval: 5
-    maxFailures: 1
-    provider:
-      job:
-        spec:
-          template:
-            spec:
-              containers:
-              - name: sleep
-                image: alpine:3.8
-                command: [sh, -c]
-                args: [FLIP=$(($(($RANDOM%10))%2)) && exit $FLIP]
-              restartPolicy: Never
-          backoffLimit: 0
diff --git a/examples/rollout-experiment-step.yaml b/examples/rollout-experiment-step.yaml
@@ -1,8 +1,12 @@
 # This example demonstrates a Rollout which begins an experiment at a specified step.
-# The rollout willl not proceed to the next step until the experiment is completed and successful.
+# The rollout will not proceed to the next step until the experiment is completed and successful.
 # In this example, the experiment itself starts its own AnalysisRun which is tied to the experiment.
 # This is useful for when analysis should be done only during the experimentation phase, but not
 # during the regular update of the rollout.
+#
+# Prerequisites:
+# * kubectl apply -f analysis-templates.yaml
+#
 apiVersion: argoproj.io/v1alpha1
 kind: Rollout
 metadata:
@@ -38,29 +42,3 @@ spec:
           analyses:
           - name: random-fail
             templateName: random-fail
-
----
-# This AnalysisTemplate will run a Kubernetes Job every 5 seconds, with a 50% chance of failure.
-# When the number of accumulated failures exceeds maxFailures, it will cause the analysis run to
-# fail, and subsequently cause the rollout to abort.
-kind: AnalysisTemplate
-apiVersion: argoproj.io/v1alpha1
-metadata:
-  name: random-fail
-spec:
-  metrics:
-  - name: random-fail
-    interval: 5
-    maxFailures: 1
-    provider:
-      job:
-        spec:
-          template:
-            spec:
-              containers:
-              - name: sleep
-                image: alpine:3.8
-                command: [sh, -c]
-                args: [FLIP=$(($(($RANDOM%10))%2)) && exit $FLIP]
-              restartPolicy: Never
-          backoffLimit: 0
diff --git a/experiments/experiment.go b/experiments/experiment.go
@@ -337,31 +337,16 @@ func (ec *experimentContext) reconcileAnalysisRun(analysis v1alpha1.ExperimentAn
 	newStatus.Message = run.Status.Message
 }
 
-// createAnalysisRun creates the analysis run. If an existing runs exists with same name, and is
-// semantically equal, returns the existing one, otherwise errors
+// createAnalysisRun creates the analysis run. If an existing run exists with the same name, is
+// semantically equal, and is not complete, returns the existing one, otherwise creates a new
+// run with an incremented collision counter.
 func (ec *experimentContext) createAnalysisRun(analysis v1alpha1.ExperimentAnalysisTemplateRef) (*v1alpha1.AnalysisRun, error) {
 	analysisRunIf := ec.argoProjClientset.ArgoprojV1alpha1().AnalysisRuns(ec.ex.Namespace)
 	run, err := ec.newAnalysisRun(analysis, analysis.Arguments)
 	if err != nil {
 		return nil, err
 	}
-	newRun, createErr := analysisRunIf.Create(run)
-	if createErr != nil {
-		if !k8serrors.IsAlreadyExists(createErr) {
-			return nil, createErr
-		}
-		existingRun, err := analysisRunIf.Get(run.Name, metav1.GetOptions{})
-		if err != nil {
-			return nil, err
-		}
-		controllerRef := metav1.GetControllerOf(existingRun)
-		if ec.ex.UID == controllerRef.UID && analysisutil.IsSemanticallyEqual(run.Spec, existingRun.Spec) {
-			ec.log.Infof("Claimed existing analysisrun %s", existingRun.Name)
-			return existingRun, nil
-		}
-		return nil, createErr
-	}
-	return newRun, nil
+	return analysisutil.CreateWithCollisionCounter(ec.log, analysisRunIf, *run)
 }
 
 func (ec *experimentContext) calculateStatus() *v1alpha1.ExperimentStatus {