-
Notifications
You must be signed in to change notification settings - Fork 696
/
hpa.go
102 lines (91 loc) · 3.36 KB
/
hpa.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// Copyright 2021 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
package pytorch
import (
	"context"
	"fmt"

	autoscalingv2 "k8s.io/api/autoscaling/v2"
	"k8s.io/apimachinery/pkg/api/equality"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/klog/v2"
	controllerruntime "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	trainutil "github.com/kubeflow/training-operator/pkg/util/train"
)
// ReconcileHPA ensures the HorizontalPodAutoscaler for an elastic PyTorchJob
// matches the state derived from the job spec:
//   - jobs without an ElasticPolicy (or without Metrics) are skipped;
//   - suspended jobs must not be autoscaled, so an existing HPA is deleted
//     and a missing one is left absent;
//   - otherwise the HPA is created if missing, or updated in place when its
//     spec has drifted from the desired spec.
func (r *PyTorchJobReconciler) ReconcileHPA(pytorchJob *kubeflowv1.PyTorchJob) error {
	logger := r.Log.WithValues(kubeflowv1.PyTorchJobSingular, pytorchJob.Name)

	if pytorchJob.Spec.ElasticPolicy == nil || pytorchJob.Spec.ElasticPolicy.Metrics == nil {
		logger.V(1).Info(
			"No ElasticPolicy or Metric is specified, skipping HPA reconciling process")
		return nil
	}

	ctx := context.TODO()

	// Build the HPA we expect to exist for this job.
	expected, err := desiredHPA(pytorchJob, r.Scheme)
	if err != nil {
		return err
	}

	current := &autoscalingv2.HorizontalPodAutoscaler{}
	if err = r.Get(ctx, client.ObjectKeyFromObject(expected), current); err != nil {
		if errors.IsNotFound(err) {
			if trainutil.IsJobSuspended(&pytorchJob.Spec.RunPolicy) {
				// If the job is suspended, it's correct behavior that HPA doesn't exist.
				return nil
			}
			// Create the new HPA.
			logger.V(1).Info("Creating HPA", "namespace", expected.Namespace, "name", expected.Name)
			return r.Create(ctx, expected)
		}
		return err
	}

	if trainutil.IsJobSuspended(&pytorchJob.Spec.RunPolicy) {
		// Suspended jobs must not scale; delete the now-stale HPA.
		logger.V(1).Info("Deleting HPA", "HorizontalPodAutoscaler", klog.KObj(current))
		return r.Delete(ctx, current)
	}

	if !equality.Semantic.DeepEqual(expected.Spec, current.Spec) {
		logger.V(1).Info("Updating HPA", "namespace", current.Namespace, "name", current.Name)
		// Carry over the resource version so the update targets the observed object.
		expected.ResourceVersion = current.ResourceVersion
		if err = r.Update(ctx, expected); err != nil {
			return err
		}
	}
	return nil
}
// desiredHPA builds the HorizontalPodAutoscaler that should exist for the
// given elastic PyTorchJob. The job is set as the controller owner so the
// HPA is garbage-collected along with the job. It returns an error when
// the elastic policy is missing required fields or when the owner
// reference cannot be set.
func desiredHPA(pytorchJob *kubeflowv1.PyTorchJob, scheme *runtime.Scheme) (
	*autoscalingv2.HorizontalPodAutoscaler, error) {
	// Guard against a nil ElasticPolicy/MaxReplicas instead of panicking on
	// the dereference below; callers only validate that ElasticPolicy and
	// Metrics are non-nil.
	elasticPolicy := pytorchJob.Spec.ElasticPolicy
	if elasticPolicy == nil || elasticPolicy.MaxReplicas == nil {
		return nil, fmt.Errorf("job %s/%s: elasticPolicy.maxReplicas must be set to build an HPA",
			pytorchJob.Namespace, pytorchJob.Name)
	}
	hpa := &autoscalingv2.HorizontalPodAutoscaler{
		ObjectMeta: metav1.ObjectMeta{
			Name:      pytorchJob.Name,
			Namespace: pytorchJob.Namespace,
		},
		Spec: autoscalingv2.HorizontalPodAutoscalerSpec{
			ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{
				Kind:       pytorchJob.Kind,
				Name:       pytorchJob.Name,
				APIVersion: pytorchJob.APIVersion,
			},
			MinReplicas: elasticPolicy.MinReplicas,
			MaxReplicas: *elasticPolicy.MaxReplicas,
			Metrics:     elasticPolicy.Metrics,
		},
	}
	if err := controllerruntime.SetControllerReference(pytorchJob, hpa, scheme); err != nil {
		return nil, err
	}
	return hpa, nil
}