Skip to content

Commit c855ee4

Browse files
authored
Fix: spark application does not respect time to live seconds (kubeflow#2165)
* Add time to live seconds example spark application
  Signed-off-by: Yi Chen <github@chenyicn.net>
* fix: spark application does not respect time to live seconds
  Signed-off-by: Yi Chen <github@chenyicn.net>
---------
Signed-off-by: Yi Chen <github@chenyicn.net>
1 parent a2f71c6 commit c855ee4

File tree

3 files changed

+95
-73
lines changed

3 files changed

+95
-73
lines changed

examples/spark-pi-ttl.yaml

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#
2+
# Copyright 2024 The Kubeflow authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# https://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
apiVersion: sparkoperator.k8s.io/v1beta2
17+
kind: SparkApplication
18+
metadata:
19+
name: spark-pi-ttl
20+
namespace: default
21+
spec:
22+
type: Scala
23+
mode: cluster
24+
image: spark:3.5.2
25+
imagePullPolicy: IfNotPresent
26+
mainClass: org.apache.spark.examples.SparkPi
27+
mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.2.jar
28+
sparkVersion: 3.5.2
29+
timeToLiveSeconds: 30
30+
driver:
31+
cores: 1
32+
memory: 512m
33+
serviceAccount: spark-operator-spark
34+
executor:
35+
instances: 1
36+
cores: 1
37+
memory: 512m

internal/controller/sparkapplication/controller.go

+52-73
Original file line numberDiff line numberDiff line change
@@ -371,9 +371,11 @@ func (r *Reconciler) reconcileRunningSparkApplication(ctx context.Context, req c
371371
if err := r.updateSparkApplicationState(ctx, app); err != nil {
372372
return err
373373
}
374+
374375
if err := r.updateSparkApplicationStatus(ctx, app); err != nil {
375376
return err
376377
}
378+
377379
return nil
378380
},
379381
)
@@ -529,85 +531,62 @@ func (r *Reconciler) reconcileFailingSparkApplication(ctx context.Context, req c
529531
}
530532

531533
func (r *Reconciler) reconcileCompletedSparkApplication(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
532-
key := req.NamespacedName
533-
retryErr := retry.RetryOnConflict(
534-
retry.DefaultRetry,
535-
func() error {
536-
old, err := r.getSparkApplication(key)
537-
if err != nil {
538-
return err
539-
}
540-
if old.Status.AppState.State != v1beta2.ApplicationStateCompleted {
541-
return nil
542-
}
543-
app := old.DeepCopy()
544-
545-
if util.IsExpired(app) {
546-
logger.Info("Deleting expired SparkApplication", "name", app.Name, "namespace", app.Namespace, "state", app.Status.AppState.State)
547-
if err := r.client.Delete(ctx, app); err != nil {
548-
return err
549-
}
550-
return nil
551-
}
552-
if err := r.updateExecutorState(ctx, app); err != nil {
553-
return err
554-
}
555-
if err := r.updateSparkApplicationStatus(ctx, app); err != nil {
556-
return err
557-
}
558-
if err := r.cleanUpOnTermination(old, app); err != nil {
559-
logger.Error(err, "Failed to clean up resources for SparkApplication", "name", old.Name, "namespace", old.Namespace, "state", old.Status.AppState.State)
560-
return err
561-
}
562-
return nil
563-
},
564-
)
565-
if retryErr != nil {
566-
logger.Error(retryErr, "Failed to reconcile SparkApplication", "name", key.Name, "namespace", key.Namespace)
567-
return ctrl.Result{}, retryErr
568-
}
569-
return ctrl.Result{}, nil
534+
return r.reconcileTerminatedSparkApplication(ctx, req)
570535
}
571536

572537
func (r *Reconciler) reconcileFailedSparkApplication(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
538+
return r.reconcileTerminatedSparkApplication(ctx, req)
539+
}
540+
541+
func (r *Reconciler) reconcileTerminatedSparkApplication(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
573542
key := req.NamespacedName
574-
retryErr := retry.RetryOnConflict(
575-
retry.DefaultRetry,
576-
func() error {
577-
old, err := r.getSparkApplication(key)
578-
if err != nil {
579-
return err
580-
}
581-
if old.Status.AppState.State != v1beta2.ApplicationStateFailed {
582-
return nil
583-
}
584-
app := old.DeepCopy()
543+
old, err := r.getSparkApplication(key)
544+
if err != nil {
545+
return ctrl.Result{Requeue: true}, err
546+
}
585547

586-
if util.IsExpired(app) {
587-
logger.Info("Deleting expired SparkApplication", "name", app.Name, "namespace", app.Namespace, "state", app.Status.AppState.State)
588-
if err := r.client.Delete(ctx, app); err != nil {
589-
return err
590-
}
591-
return nil
592-
}
593-
if err := r.updateExecutorState(ctx, app); err != nil {
594-
return err
595-
}
596-
if err := r.updateSparkApplicationStatus(ctx, app); err != nil {
597-
return err
598-
}
599-
if err := r.cleanUpOnTermination(old, app); err != nil {
600-
logger.Error(err, "Failed to clean up resources for SparkApplication", "name", old.Name, "namespace", old.Namespace, "state", old.Status.AppState.State)
601-
return err
602-
}
603-
return nil
604-
},
605-
)
606-
if retryErr != nil {
607-
logger.Error(retryErr, "Failed to reconcile SparkApplication", "name", key.Name, "namespace", key.Namespace)
608-
return ctrl.Result{}, retryErr
548+
app := old.DeepCopy()
549+
if !util.IsTerminated(app) {
550+
return ctrl.Result{}, nil
609551
}
610-
return ctrl.Result{}, nil
552+
553+
if util.IsExpired(app) {
554+
logger.Info("Deleting expired SparkApplication", "name", app.Name, "namespace", app.Namespace, "state", app.Status.AppState.State)
555+
if err := r.client.Delete(ctx, app); err != nil {
556+
return ctrl.Result{Requeue: true}, err
557+
}
558+
return ctrl.Result{}, nil
559+
}
560+
561+
if err := r.updateExecutorState(ctx, app); err != nil {
562+
return ctrl.Result{Requeue: true}, err
563+
}
564+
565+
if err := r.updateSparkApplicationStatus(ctx, app); err != nil {
566+
return ctrl.Result{Requeue: true}, err
567+
}
568+
569+
if err := r.cleanUpOnTermination(old, app); err != nil {
570+
logger.Error(err, "Failed to clean up resources for SparkApplication", "name", old.Name, "namespace", old.Namespace, "state", old.Status.AppState.State)
571+
return ctrl.Result{Requeue: true}, err
572+
}
573+
574+
// If the termination time is not set, or the TTL is not set or not positive, do not requeue this application.
575+
if app.Status.TerminationTime.IsZero() || app.Spec.TimeToLiveSeconds == nil || *app.Spec.TimeToLiveSeconds <= 0 {
576+
return ctrl.Result{}, nil
577+
}
578+
579+
// Otherwise, requeue the application for subsequent deletion.
580+
now := time.Now()
581+
ttl := time.Duration(*app.Spec.TimeToLiveSeconds) * time.Second
582+
survival := now.Sub(app.Status.TerminationTime.Time)
583+
584+
// If the survival time is greater than or equal to the TTL, requeue the application immediately.
585+
if survival >= ttl {
586+
return ctrl.Result{Requeue: true}, nil
587+
}
588+
// Otherwise, requeue the application after the remaining duration (TTL - survival) has elapsed.
589+
return ctrl.Result{RequeueAfter: ttl - survival}, nil
611590
}
612591

613592
func (r *Reconciler) reconcileUnknownSparkApplication(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {

pkg/util/sparkapplication.go

+6
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ func GetApplicationState(app *v1beta2.SparkApplication) v1beta2.ApplicationState
5151
return app.Status.AppState.State
5252
}
5353

54+
// IsTerminated returns whether the given SparkApplication is terminated.
55+
func IsTerminated(app *v1beta2.SparkApplication) bool {
56+
return app.Status.AppState.State == v1beta2.ApplicationStateCompleted ||
57+
app.Status.AppState.State == v1beta2.ApplicationStateFailed
58+
}
59+
5460
// IsExpired returns whether the given SparkApplication is expired.
5561
func IsExpired(app *v1beta2.SparkApplication) bool {
5662
// The application has no TTL defined and will never expire.

0 commit comments

Comments
 (0)