Skip to content

Commit d815e78

Browse files
Robustness to driver pod taking time to create (#2315)
* Retry after driver pod not found if recent submission Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Add a test Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Make grace period configurable Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Update test Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Add an extra test with the driver pod Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Separate context to create and delete the driver pod Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Tidy Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Autoformat Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Update error message Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Add helm parameter Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Update internal/controller/sparkapplication/controller.go Co-authored-by: Yi Chen <github@chenyicn.net> Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> * Newlines between helm tests Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> --------- Signed-off-by: Thomas Newton <thomas.w.newton@gmail.com> Co-authored-by: Yi Chen <github@chenyicn.net>
1 parent a261523 commit d815e78

File tree

7 files changed

+182
-12
lines changed

7 files changed

+182
-12
lines changed

charts/spark-operator-chart/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command docum
8686
| controller.replicas | int | `1` | Number of replicas of controller. |
8787
| controller.workers | int | `10` | Reconcile concurrency, higher values might increase memory usage. |
8888
| controller.logLevel | string | `"info"` | Configure the verbosity of logging, can be one of `debug`, `info`, `error`. |
89+
| controller.driverPodCreationGracePeriod | string | `"10s"` | Grace period after a successful spark-submit during which `driver pod not found` errors are retried. Useful if the driver pod can take some time to be created. |
8990
| controller.maxTrackedExecutorPerApp | int | `1000` | Specifies the maximum number of Executor pods that can be tracked by the controller per SparkApplication. |
9091
| controller.uiService.enable | bool | `true` | Specifies whether to create service for Spark web UI. |
9192
| controller.uiIngress.enable | bool | `false` | Specifies whether to create ingress for Spark web UI. `controller.uiService.enable` must be `true` to enable ingress. |

charts/spark-operator-chart/templates/controller/deployment.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ spec:
100100
{{- if .Values.controller.workqueueRateLimiter.maxDelay.enable }}
101101
- --workqueue-ratelimiter-max-delay={{ .Values.controller.workqueueRateLimiter.maxDelay.duration }}
102102
{{- end }}
103+
{{- if .Values.controller.driverPodCreationGracePeriod }}
104+
- --driver-pod-creation-grace-period={{ .Values.controller.driverPodCreationGracePeriod }}
105+
{{- end }}
103106
{{- if .Values.controller.maxTrackedExecutorPerApp }}
104107
- --max-tracked-executor-per-app={{ .Values.controller.maxTrackedExecutorPerApp }}
105108
{{- end }}

charts/spark-operator-chart/tests/controller/deployment_test.yaml

+10
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,16 @@ tests:
651651
- notContains:
652652
path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args
653653
content: --workqueue-ratelimiter-max-delay=1h
654+
655+
- it: Should contain `driver-pod-creation-grace-period` arg if `controller.driverPodCreationGracePeriod` is set
656+
set:
657+
controller:
658+
driverPodCreationGracePeriod: 30s
659+
asserts:
660+
- contains:
661+
path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args
662+
content: --driver-pod-creation-grace-period=30s
663+
654664
- it: Should contain `--max-tracked-executor-per-app` arg if `controller.maxTrackedExecutorPerApp` is set
655665
set:
656666
controller:

charts/spark-operator-chart/values.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ controller:
5151
# -- Configure the verbosity of logging, can be one of `debug`, `info`, `error`.
5252
logLevel: info
5353

54+
# -- Grace period after a successful spark-submit during which `driver pod not found` errors are retried. Useful if the driver pod can take some time to be created.
55+
driverPodCreationGracePeriod: 10s
56+
5457
# -- Specifies the maximum number of Executor pods that can be tracked by the controller per SparkApplication.
5558
maxTrackedExecutorPerApp: 1000
5659

cmd/operator/controller/start.go

+13-8
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ var (
100100
leaderElectionRenewDeadline time.Duration
101101
leaderElectionRetryPeriod time.Duration
102102

103+
driverPodCreationGracePeriod time.Duration
104+
103105
// Metrics
104106
enableMetrics bool
105107
metricsBindAddress string
@@ -163,6 +165,8 @@ func NewStartCommand() *cobra.Command {
163165
command.Flags().DurationVar(&leaderElectionRenewDeadline, "leader-election-renew-deadline", 14*time.Second, "Leader election renew deadline.")
164166
command.Flags().DurationVar(&leaderElectionRetryPeriod, "leader-election-retry-period", 4*time.Second, "Leader election retry period.")
165167

168+
command.Flags().DurationVar(&driverPodCreationGracePeriod, "driver-pod-creation-grace-period", 10*time.Second, "Grace period after a successful spark-submit when driver pod not found errors will be retried. Useful if the driver pod can take some time to be created.")
169+
166170
command.Flags().BoolVar(&enableMetrics, "enable-metrics", false, "Enable metrics.")
167171
command.Flags().StringVar(&metricsBindAddress, "metrics-bind-address", "0", "The address the metric endpoint binds to. "+
168172
"Use the port :8080. If not set, it will be 0 in order to disable the metrics server")
@@ -394,14 +398,15 @@ func newSparkApplicationReconcilerOptions() sparkapplication.Options {
394398
sparkExecutorMetrics.Register()
395399
}
396400
options := sparkapplication.Options{
397-
Namespaces: namespaces,
398-
EnableUIService: enableUIService,
399-
IngressClassName: ingressClassName,
400-
IngressURLFormat: ingressURLFormat,
401-
DefaultBatchScheduler: defaultBatchScheduler,
402-
SparkApplicationMetrics: sparkApplicationMetrics,
403-
SparkExecutorMetrics: sparkExecutorMetrics,
404-
MaxTrackedExecutorPerApp: maxTrackedExecutorPerApp,
401+
Namespaces: namespaces,
402+
EnableUIService: enableUIService,
403+
IngressClassName: ingressClassName,
404+
IngressURLFormat: ingressURLFormat,
405+
DefaultBatchScheduler: defaultBatchScheduler,
406+
DriverPodCreationGracePeriod: driverPodCreationGracePeriod,
407+
SparkApplicationMetrics: sparkApplicationMetrics,
408+
SparkExecutorMetrics: sparkExecutorMetrics,
409+
MaxTrackedExecutorPerApp: maxTrackedExecutorPerApp,
405410
}
406411
if enableBatchScheduler {
407412
options.KubeSchedulerNames = kubeSchedulerNames

internal/controller/sparkapplication/controller.go

+9-4
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ type Options struct {
6363
IngressURLFormat string
6464
DefaultBatchScheduler string
6565

66+
DriverPodCreationGracePeriod time.Duration
67+
6668
KubeSchedulerNames []string
6769

6870
SparkApplicationMetrics *metrics.SparkApplicationMetrics
@@ -773,10 +775,13 @@ func (r *Reconciler) updateDriverState(_ context.Context, app *v1beta2.SparkAppl
773775
}
774776

775777
if driverPod == nil {
776-
app.Status.AppState.State = v1beta2.ApplicationStateFailing
777-
app.Status.AppState.ErrorMessage = "driver pod not found"
778-
app.Status.TerminationTime = metav1.Now()
779-
return nil
778+
if app.Status.AppState.State != v1beta2.ApplicationStateSubmitted || metav1.Now().Sub(app.Status.LastSubmissionAttemptTime.Time) > r.options.DriverPodCreationGracePeriod {
779+
app.Status.AppState.State = v1beta2.ApplicationStateFailing
780+
app.Status.AppState.ErrorMessage = "driver pod not found"
781+
app.Status.TerminationTime = metav1.Now()
782+
return nil
783+
}
784+
return fmt.Errorf("driver pod not found, while inside the grace period. Grace period of %v expires at %v", r.options.DriverPodCreationGracePeriod, app.Status.LastSubmissionAttemptTime.Add(r.options.DriverPodCreationGracePeriod))
780785
}
781786

782787
app.Status.SparkApplicationID = util.GetSparkApplicationID(driverPod)

internal/controller/sparkapplication/controller_test.go

+143
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,149 @@ var _ = Describe("SparkApplication Controller", func() {
7474
})
7575
})
7676

77+
Context("When reconciling a submitted SparkApplication with no driver pod", func() {
78+
ctx := context.Background()
79+
appName := "test"
80+
appNamespace := "default"
81+
key := types.NamespacedName{
82+
Name: appName,
83+
Namespace: appNamespace,
84+
}
85+
86+
BeforeEach(func() {
87+
By("Creating a test SparkApplication")
88+
app := &v1beta2.SparkApplication{}
89+
if err := k8sClient.Get(ctx, key, app); err != nil && errors.IsNotFound(err) {
90+
app = &v1beta2.SparkApplication{
91+
ObjectMeta: metav1.ObjectMeta{
92+
Name: appName,
93+
Namespace: appNamespace,
94+
},
95+
Spec: v1beta2.SparkApplicationSpec{
96+
MainApplicationFile: util.StringPtr("local:///dummy.jar"),
97+
},
98+
}
99+
v1beta2.SetSparkApplicationDefaults(app)
100+
Expect(k8sClient.Create(ctx, app)).To(Succeed())
101+
102+
app.Status.AppState.State = v1beta2.ApplicationStateSubmitted
103+
app.Status.DriverInfo.PodName = "non-existent-driver"
104+
app.Status.LastSubmissionAttemptTime = metav1.NewTime(time.Now())
105+
Expect(k8sClient.Status().Update(ctx, app)).To(Succeed())
106+
}
107+
})
108+
109+
AfterEach(func() {
110+
app := &v1beta2.SparkApplication{}
111+
Expect(k8sClient.Get(ctx, key, app)).To(Succeed())
112+
113+
By("Deleting the created test SparkApplication")
114+
Expect(k8sClient.Delete(ctx, app)).To(Succeed())
115+
})
116+
117+
It("Should requeue submitted SparkApplication when driver pod not found inside the grace period", func() {
118+
By("Reconciling the created test SparkApplication")
119+
reconciler := sparkapplication.NewReconciler(
120+
nil,
121+
k8sClient.Scheme(),
122+
k8sClient,
123+
nil,
124+
nil,
125+
sparkapplication.Options{Namespaces: []string{appNamespace}, DriverPodCreationGracePeriod: 10 * time.Second},
126+
)
127+
_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
128+
Expect(err).To(MatchError(ContainSubstring("driver pod not found, while inside the grace period. Grace period of")))
129+
app := &v1beta2.SparkApplication{}
130+
Expect(k8sClient.Get(ctx, key, app)).To(Succeed())
131+
Expect(app.Status.AppState.State).To(Equal(v1beta2.ApplicationStateSubmitted))
132+
})
133+
134+
It("Should fail a SparkApplication when driver pod not found outside the grace period", func() {
135+
By("Reconciling the created test SparkApplication")
136+
reconciler := sparkapplication.NewReconciler(
137+
nil,
138+
k8sClient.Scheme(),
139+
k8sClient,
140+
nil,
141+
nil,
142+
sparkapplication.Options{Namespaces: []string{appNamespace}, DriverPodCreationGracePeriod: 0 * time.Second},
143+
)
144+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
145+
Expect(err).NotTo(HaveOccurred())
146+
Expect(result.Requeue).To(BeFalse())
147+
148+
app := &v1beta2.SparkApplication{}
149+
Expect(k8sClient.Get(ctx, key, app)).To(Succeed())
150+
Expect(app.Status.AppState.State).To(Equal(v1beta2.ApplicationStateFailing))
151+
})
152+
})
153+
154+
Context("When reconciling a SparkApplication with driver pod", func() {
155+
ctx := context.Background()
156+
appName := "test"
157+
appNamespace := "default"
158+
key := types.NamespacedName{
159+
Name: appName,
160+
Namespace: appNamespace,
161+
}
162+
163+
BeforeEach(func() {
164+
By("Creating a test SparkApplication")
165+
app := &v1beta2.SparkApplication{}
166+
if err := k8sClient.Get(ctx, key, app); err != nil && errors.IsNotFound(err) {
167+
app = &v1beta2.SparkApplication{
168+
ObjectMeta: metav1.ObjectMeta{
169+
Name: appName,
170+
Namespace: appNamespace,
171+
},
172+
Spec: v1beta2.SparkApplicationSpec{
173+
MainApplicationFile: util.StringPtr("local:///dummy.jar"),
174+
},
175+
}
176+
v1beta2.SetSparkApplicationDefaults(app)
177+
Expect(k8sClient.Create(ctx, app)).To(Succeed())
178+
179+
app.Status.AppState.State = v1beta2.ApplicationStateSubmitted
180+
driverPod := createDriverPod(appName, appNamespace)
181+
Expect(k8sClient.Create(ctx, driverPod)).To(Succeed())
182+
app.Status.DriverInfo.PodName = driverPod.Name
183+
Expect(k8sClient.Status().Update(ctx, app)).To(Succeed())
184+
}
185+
})
186+
187+
AfterEach(func() {
188+
app := &v1beta2.SparkApplication{}
189+
Expect(k8sClient.Get(ctx, key, app)).To(Succeed())
190+
191+
By("Deleting the created test SparkApplication")
192+
Expect(k8sClient.Delete(ctx, app)).To(Succeed())
193+
194+
By("Deleting the driver pod")
195+
driverPod := &corev1.Pod{}
196+
Expect(k8sClient.Get(ctx, getDriverNamespacedName(appName, appNamespace), driverPod)).To(Succeed())
197+
Expect(k8sClient.Delete(ctx, driverPod)).To(Succeed())
198+
})
199+
200+
It("When reconciling a submitted SparkApplication when driver pod exists", func() {
201+
By("Reconciling the created test SparkApplication")
202+
reconciler := sparkapplication.NewReconciler(
203+
nil,
204+
k8sClient.Scheme(),
205+
k8sClient,
206+
nil,
207+
nil,
208+
sparkapplication.Options{Namespaces: []string{appNamespace}, DriverPodCreationGracePeriod: 0 * time.Second},
209+
)
210+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
211+
Expect(err).NotTo(HaveOccurred())
212+
Expect(result.Requeue).To(BeFalse())
213+
214+
app := &v1beta2.SparkApplication{}
215+
Expect(k8sClient.Get(ctx, key, app)).To(Succeed())
216+
Expect(app.Status.AppState.State).To(Equal(v1beta2.ApplicationStateSubmitted))
217+
})
218+
})
219+
77220
Context("When reconciling a completed SparkApplication", func() {
78221
ctx := context.Background()
79222
appName := "test"

0 commit comments

Comments
 (0)