
Commit 2d8cd56

Support cluster-autoscaler 1.30.x yaml status (#297)
* Add parsing of yaml status, present since cluster-autoscaler 1.30
* Remove parsing of LongNotStarted due to deprecation since cluster-autoscaler 1.22
1 parent 95b7daa commit 2d8cd56

File tree

16 files changed: +825 -234 lines changed


README.md

Lines changed: 17 additions & 16 deletions

@@ -56,22 +56,23 @@ Follow Kubestitute documentation for Helm deployment [here](./helm/kubestitute).
 ### <a id="Configuration_Optional_args"></a>Optional args

 The kubestitute container takes as argument the parameters below.
 | Key | Description | Default |
 | --- | --- | --- |
 | clusterautoscaler-namespace | The namespace the clusterautoscaler belongs to. | kube-system |
 | clusterautoscaler-status-name | The name of the clusterautoscaler status configmap. | cluster-autoscaler-status |
 | cluster-autoscaler-priority-expander-config-map | The name of the clusterautoscaler priority expander config map. | cluster-autoscaler-priority-expander |
+| clusterautoscaler-status-legacy-format | Set if the clusterautoscaler status configmap is formatted in the legacy readable format, used by cluster-autoscaler up to version 1.29. | `false` |
 | priority-expander-enabled | Is the PriorityExpander controller enabled. | `false` |
 | priority-expander-namespace | The namespace the _unique_ priority expander object belongs to. | kubestitute-system |
 | priority-expander-name | The only accepted name for the priority expander object. | priority-expander-default |
 | dev | Enable dev mode for logging. | `false` |
 | v | Logs verbosity. 0 => panic, 1 => error, 2 => warning, 3 => info, 4 => debug | 3 |
 | asg-poll-interval | AutoScaling Groups polling interval (used to generate custom metrics about ASGs). | 30 |
 | eviction-timeout | The timeout in seconds for pods eviction on Instance deletion. | 300 |
 | instances-max-concurrent-reconciles | The maximum number of concurrent Reconciles which can be run for Instances. | 10 |
 | metrics-bind-address | The address the metric endpoint binds to. | :8080 |
 | health-probe-bind-address | The address the probe endpoint binds to. | :8081 |
 | leader-elect | Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager. | `false` |

 ## CustomResourceDefinitions

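For context on how the new clusterautoscaler-status-legacy-format argument could be consumed, here is a minimal sketch of flag wiring into a configuration struct. The main package, the local statusConfig type, and the wiring itself are assumptions for illustration; only the flag names, defaults, and descriptions come from the README table above, and the project's real entrypoint is not part of this diff.

// Minimal sketch (assumed wiring, not the project's actual main.go):
// it only shows how the documented flags could reach a reconciler configuration.
package main

import (
	"flag"
	"fmt"
)

// statusConfig locally mirrors, for illustration, the configuration fields
// added to the reconcilers in this commit.
type statusConfig struct {
	ClusterAutoscalerNamespace          string
	ClusterAutoscalerStatusName         string
	ClusterAutoscalerStatusLegacyFormat bool
}

func main() {
	var cfg statusConfig
	flag.StringVar(&cfg.ClusterAutoscalerNamespace, "clusterautoscaler-namespace", "kube-system",
		"The namespace the clusterautoscaler belongs to.")
	flag.StringVar(&cfg.ClusterAutoscalerStatusName, "clusterautoscaler-status-name", "cluster-autoscaler-status",
		"The name of the clusterautoscaler status configmap.")
	flag.BoolVar(&cfg.ClusterAutoscalerStatusLegacyFormat, "clusterautoscaler-status-legacy-format", false,
		"Set if the status configmap uses the legacy readable format (cluster-autoscaler up to 1.29).")
	flag.Parse()

	fmt.Printf("parsed configuration: %+v\n", cfg)
}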
api/v1alpha1/scheduler_types.go

Lines changed: 2 additions & 1 deletion

@@ -161,7 +161,8 @@ type IntOrArithmeticOperation struct {
 // It is based on ASG health status.
 type Field string

-// All Field constants
+// All Field constants.
+// LongNotStarted is deprecated and will always be 0.
 const (
 	FieldReady   Field = "Ready"
 	FieldUnready Field = "Unready"
controllers/priorityexpander_controller.go

Lines changed: 27 additions & 17 deletions

@@ -44,11 +44,12 @@ import (
 )

 type PriorityExpanderReconcilerConfiguration struct {
-	ClusterAutoscalerNamespace       string
-	ClusterAutoscalerStatusName      string
-	ClusterAutoscalerPEConfigMapName string
-	PriorityExpanderNamespace        string
-	PriorityExpanderName             string
+	ClusterAutoscalerNamespace          string
+	ClusterAutoscalerStatusName         string
+	ClusterAutoscalerStatusLegacyFormat bool
+	ClusterAutoscalerPEConfigMapName    string
+	PriorityExpanderNamespace           string
+	PriorityExpanderName                string
 }

 type PriorityExpanderReconciler struct {
@@ -125,20 +126,30 @@ func (r *PriorityExpanderReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	}

 	// ... and parse it.
-	status := clusterautoscaler.ParseReadableString(readableStatus)
+	var status *clusterautoscaler.ClusterAutoscalerStatus
+	if !r.Configuration.ClusterAutoscalerStatusLegacyFormat {
+		s, err := clusterautoscaler.ParseYamlStatus(readableStatus)
+		if err != nil {
+			log.Error(err, "Unable to parse status configmap yaml content")
+			return ctrl.Result{}, fmt.Errorf("unable to parse status configmap yaml content: %w", err)
+		}
+		status = s
+	} else {
+		status = clusterautoscaler.ParseReadableStatus(readableStatus)
+	}

-	var oroot = map[string]map[string]int32{}
+	oroot := map[string]map[string]int32{}
 	for _, node := range status.NodeGroups {
 		oroot[node.Name] = make(map[string]int32)
-		oroot[node.Name]["CloudProviderTarget"] = node.Health.CloudProviderTarget
-		oroot[node.Name]["Ready"] = node.Health.Ready
-		oroot[node.Name]["Unready"] = node.Health.Unready
-		oroot[node.Name]["NotStarted"] = node.Health.NotStarted
-		oroot[node.Name]["LongNotStarted"] = node.Health.LongNotStarted
-		oroot[node.Name]["Registered"] = node.Health.Registered
-		oroot[node.Name]["LongUnregistered"] = node.Health.LongUnregistered
-		oroot[node.Name]["MinSize"] = node.Health.MinSize
-		oroot[node.Name]["MaxSize"] = node.Health.MaxSize
+		oroot[node.Name]["CloudProviderTarget"] = int32(node.Health.CloudProviderTarget)
+		oroot[node.Name]["Ready"] = int32(node.Health.NodeCounts.Registered.Ready)
+		oroot[node.Name]["Unready"] = int32(node.Health.NodeCounts.Registered.Unready.Total)
+		oroot[node.Name]["NotStarted"] = int32(node.Health.NodeCounts.Registered.NotStarted)
+		oroot[node.Name]["LongNotStarted"] = 0
+		oroot[node.Name]["Registered"] = int32(node.Health.NodeCounts.Registered.Total)
+		oroot[node.Name]["LongUnregistered"] = int32(node.Health.NodeCounts.LongUnregistered)
+		oroot[node.Name]["MinSize"] = int32(node.Health.MinSize)
+		oroot[node.Name]["MaxSize"] = int32(node.Health.MaxSize)
 	}

 	// Create new PriorityExpander template and parse it
@@ -169,7 +180,6 @@ func (r *PriorityExpanderReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	}

 	op, err := ctrl.CreateOrUpdate(ctx, r.Client, &pecm, func() error {
-
 		pecm.Data = map[string]string{
 			"priorities": buf.String(),
 		}
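The reconciler above now selects between clusterautoscaler.ParseYamlStatus and clusterautoscaler.ParseReadableStatus, but neither implementation is part of this excerpt. The following is a self-contained sketch of what the YAML path might look like, assuming the cluster-autoscaler 1.30 status document exposes the fields accessed in the loop above; the struct definitions, YAML tags, and sample document are illustrative assumptions, not the project's actual types.

// Hypothetical, self-contained sketch of a YAML status parser.
// Field names follow the accesses in the diff above; the YAML tags and the
// sample document are assumptions about the cluster-autoscaler 1.30 format.
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

type RegisteredUnready struct {
	Total int `yaml:"total"`
}

type RegisteredNodeCounts struct {
	Total      int               `yaml:"total"`
	Ready      int               `yaml:"ready"`
	NotStarted int               `yaml:"notStarted"`
	Unready    RegisteredUnready `yaml:"unready"`
}

type NodeCounts struct {
	Registered       RegisteredNodeCounts `yaml:"registered"`
	LongUnregistered int                  `yaml:"longUnregistered"`
}

type NodeGroupHealth struct {
	NodeCounts          NodeCounts `yaml:"nodeCounts"`
	CloudProviderTarget int        `yaml:"cloudProviderTarget"`
	MinSize             int        `yaml:"minSize"`
	MaxSize             int        `yaml:"maxSize"`
}

type NodeGroupStatus struct {
	Name   string          `yaml:"name"`
	Health NodeGroupHealth `yaml:"health"`
}

type ClusterAutoscalerStatus struct {
	NodeGroups []NodeGroupStatus `yaml:"nodeGroups"`
}

// parseYamlStatus unmarshals the status configmap payload into the structs above.
func parseYamlStatus(s string) (*ClusterAutoscalerStatus, error) {
	var status ClusterAutoscalerStatus
	if err := yaml.Unmarshal([]byte(s), &status); err != nil {
		return nil, fmt.Errorf("unable to parse status configmap yaml content: %w", err)
	}
	return &status, nil
}

func main() {
	// Assumed sample excerpt of a 1.30-style status document.
	sample := `
nodeGroups:
  - name: my-asg
    health:
      cloudProviderTarget: 3
      minSize: 1
      maxSize: 10
      nodeCounts:
        longUnregistered: 0
        registered:
          total: 3
          ready: 3
          notStarted: 0
          unready:
            total: 0
`
	status, err := parseYamlStatus(sample)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s ready=%d target=%d\n",
		status.NodeGroups[0].Name,
		status.NodeGroups[0].Health.NodeCounts.Registered.Ready,
		status.NodeGroups[0].Health.CloudProviderTarget)
}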

controllers/scheduler_controller.go

Lines changed: 32 additions & 23 deletions

@@ -55,8 +55,9 @@ const (

 // SchedulerReconcilerConfiguration wraps configuration for the SchedulerReconciler.
 type SchedulerReconcilerConfiguration struct {
-	ClusterAutoscalerNamespace  string
-	ClusterAutoscalerStatusName string
+	ClusterAutoscalerNamespace          string
+	ClusterAutoscalerStatusName         string
+	ClusterAutoscalerStatusLegacyFormat bool
 }

 // SchedulerReconciler reconciles a Scheduler object
@@ -138,13 +139,23 @@ func (r *SchedulerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	}

 	// Parse it and retrieve NodeGroups from targets and fallbacks
-	status := clusterautoscaler.ParseReadableString(readableStatus)
+	var status *clusterautoscaler.ClusterAutoscalerStatus
+	if !r.Configuration.ClusterAutoscalerStatusLegacyFormat {
+		s, err := clusterautoscaler.ParseYamlStatus(readableStatus)
+		if err != nil {
+			log.Error(err, "Unable to parse status configmap yaml content")
+			return ctrl.Result{}, fmt.Errorf("unable to parse status configmap yaml content: %w", err)
+		}
+		status = s
+	} else {
+		status = clusterautoscaler.ParseReadableStatus(readableStatus)
+	}

 	asgTargets := scheduler.Spec.ASGTargets
 	if len(asgTargets) == 0 {
 		asgTargets = []string{scheduler.Spec.ASGTarget}
 	}
-	targetNodeGroups := make([]clusterautoscaler.NodeGroup, 0, len(asgTargets))
+	targetNodeGroups := make([]clusterautoscaler.NodeGroupStatus, 0, len(asgTargets))
 	for _, target := range asgTargets {
 		targetNodeGroup := clusterautoscaler.GetNodeGroupWithName(status.NodeGroups, target)
 		if targetNodeGroup == nil {
@@ -162,12 +173,12 @@ func (r *SchedulerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (

 	// Update target statuses
 	for i := range targetNodeGroups {
-		for _, s := range []clusterautoscaler.ScaleUpStatus{
-			clusterautoscaler.ScaleUpNeeded,
-			clusterautoscaler.ScaleUpNotNeeded,
-			clusterautoscaler.ScaleUpInProgress,
-			clusterautoscaler.ScaleUpNoActivity,
-			clusterautoscaler.ScaleUpBackoff,
+		for _, s := range []clusterautoscaler.ClusterAutoscalerConditionStatus{
+			clusterautoscaler.ClusterAutoscalerNeeded,
+			clusterautoscaler.ClusterAutoscalerNotNeeded,
+			clusterautoscaler.ClusterAutoscalerInProgress,
+			clusterautoscaler.ClusterAutoscalerNoActivity,
+			clusterautoscaler.ClusterAutoscalerBackoff,
 		} {
 			targetNodeGroupStatus := metrics.SchedulerTargetNodeGroupStatus.With(prometheus.Labels{
 				"node_group_name": targetNodeGroups[i].Name,
@@ -277,7 +288,7 @@ func (r *SchedulerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	if down > 0 {
 		scaleDownAllowed := false
 		for i := range targetNodeGroups {
-			if targetNodeGroups[i].ScaleUp.Status != clusterautoscaler.ScaleUpBackoff {
+			if targetNodeGroups[i].ScaleUp.Status != clusterautoscaler.ClusterAutoscalerBackoff {
 				scaleDownAllowed = true
 				break
 			}
@@ -511,7 +522,7 @@ func getMatchedPolicy(m []matchedPolicy, p corev1alpha1.SchedulerPolicy) *matche
 // nodeGroupIntOrFieldValue returns the desired value matching IntOrField.
 // Field returns the NodeGroup Field value and has priority over Int if a valid
 // Field is given.
-func nodeGroupIntOrFieldValue(ngs []clusterautoscaler.NodeGroup, iof corev1alpha1.IntOrField) int32 {
+func nodeGroupIntOrFieldValue(ngs []clusterautoscaler.NodeGroupStatus, iof corev1alpha1.IntOrField) int32 {
 	if iof.FieldVal == nil {
 		return iof.IntVal
 	}
@@ -520,39 +531,37 @@ func nodeGroupIntOrFieldValue(ngs []clusterautoscaler.NodeGroup, iof corev1alpha
 	switch *iof.FieldVal {
 	case corev1alpha1.FieldReady:
 		for i := range ngs {
-			val += ngs[i].Health.Ready
+			val += int32(ngs[i].Health.NodeCounts.Registered.Ready)
 		}
 	case corev1alpha1.FieldUnready:
 		for i := range ngs {
-			val += ngs[i].Health.Unready
+			val += int32(ngs[i].Health.NodeCounts.Registered.Unready.Total)
 		}
 	case corev1alpha1.FieldNotStarted:
 		for i := range ngs {
-			val += ngs[i].Health.NotStarted
+			val += int32(ngs[i].Health.NodeCounts.Registered.NotStarted)
 		}
 	case corev1alpha1.FieldLongNotStarted:
-		for i := range ngs {
-			val += ngs[i].Health.LongNotStarted
-		}
+		// Field deprecated, do nothing.
 	case corev1alpha1.FieldRegistered:
 		for i := range ngs {
-			val += ngs[i].Health.Registered
+			val += int32(ngs[i].Health.NodeCounts.Registered.Total)
 		}
 	case corev1alpha1.FieldLongUnregistered:
 		for i := range ngs {
-			val += ngs[i].Health.LongUnregistered
+			val += int32(ngs[i].Health.NodeCounts.LongUnregistered)
 		}
 	case corev1alpha1.FieldCloudProviderTarget:
 		for i := range ngs {
-			val += ngs[i].Health.CloudProviderTarget
+			val += int32(ngs[i].Health.CloudProviderTarget)
 		}
 	}

 	return val
 }

 // matchPolicy returns if given NodeGroup match desired Scheduler policy.
-func matchPolicy(ngs []clusterautoscaler.NodeGroup, policy corev1alpha1.SchedulerPolicy) bool {
+func matchPolicy(ngs []clusterautoscaler.NodeGroupStatus, policy corev1alpha1.SchedulerPolicy) bool {
 	left := nodeGroupIntOrFieldValue(ngs, policy.LeftOperand)
 	right := nodeGroupIntOrFieldValue(ngs, policy.RightOperand)

@@ -576,7 +585,7 @@ func matchPolicy(ngs []clusterautoscaler.NodeGroup, policy corev1alpha1.Schedule
 	}

 // replicas returns the number of required replicas.
-func nodeGroupReplicas(ngs []clusterautoscaler.NodeGroup, operation corev1alpha1.IntOrArithmeticOperation) int32 {
+func nodeGroupReplicas(ngs []clusterautoscaler.NodeGroupStatus, operation corev1alpha1.IntOrArithmeticOperation) int32 {
 	if operation.OperationVal == nil {
 		return operation.IntVal
 	}
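The target lookup in the scheduler controller relies on clusterautoscaler.GetNodeGroupWithName, whose body does not appear in this commit. A plausible minimal form is sketched below; the package name, the trimmed NodeGroupStatus type, and the helper body are assumptions, only the call site and the nil check in the diff above are from the commit.

// Assumed sketch of the lookup helper used by the scheduler controller;
// only its call site appears in this diff.
package clusterautoscaler

// NodeGroupStatus is shortened here to the single field this sketch needs.
type NodeGroupStatus struct {
	Name string
	// Health and scale-up condition fields omitted in this sketch.
}

// GetNodeGroupWithName returns the node group whose Name matches, or nil if none does.
func GetNodeGroupWithName(ngs []NodeGroupStatus, name string) *NodeGroupStatus {
	for i := range ngs {
		if ngs[i].Name == name {
			return &ngs[i]
		}
	}
	return nil
}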
