From 2bfc51bf4baac395dcc72e0c93d092370c0b7069 Mon Sep 17 00:00:00 2001
From: Maciek Pytel
Date: Tue, 14 Jun 2022 15:06:48 +0200
Subject: [PATCH] Merge pull request kubernetes/autoscaler#4970 from
 MaciekPytel/estimation_limiter

Binpacking can exit without packing all the pods
---
 .../config/autoscaling_options.go             |  21 ++
 cluster-autoscaler/core/autoscaler.go         |   2 +-
 cluster-autoscaler/core/scale_test_common.go  |   2 +-
 cluster-autoscaler/core/scale_up.go           |   2 +-
 .../estimator/binpacking_estimator.go         |  75 ++++++--
 .../estimator/binpacking_estimator_test.go    | 181 ++++++++++++------
 cluster-autoscaler/estimator/estimator.go     |  25 ++-
 .../estimator/threshold_based_limiter.go      |  64 +++++++
 .../estimator/threshold_based_limiter_test.go | 129 +++++++++++++
 cluster-autoscaler/main.go                    |  23 ++-
 10 files changed, 442 insertions(+), 82 deletions(-)
 create mode 100644 cluster-autoscaler/estimator/threshold_based_limiter.go
 create mode 100644 cluster-autoscaler/estimator/threshold_based_limiter_test.go

diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
index 4dcd95245f59..a8ce50b6ac2e 100644
--- a/cluster-autoscaler/config/autoscaling_options.go
+++ b/cluster-autoscaler/config/autoscaling_options.go
@@ -165,4 +165,25 @@ type AutoscalingOptions struct {
 	DaemonSetEvictionForOccupiedNodes bool
 	// User agent to use for HTTP calls.
 	UserAgent string
+	// InitialNodeGroupBackoffDuration is the duration of first backoff after a new node failed to start
+	InitialNodeGroupBackoffDuration time.Duration
+	// MaxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.
+	MaxNodeGroupBackoffDuration time.Duration
+	// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
+	NodeGroupBackoffResetTimeout time.Duration
+	// MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel.
+	MaxScaleDownParallelism int
+	// MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
+	MaxDrainParallelism int
+	// GceExpanderEphemeralStorageSupport is whether scale-up takes ephemeral storage resources into account.
+	GceExpanderEphemeralStorageSupport bool
+	// RecordDuplicatedEvents controls whether events should be duplicated within a 5 minute window.
+	RecordDuplicatedEvents bool
+	// MaxNodesPerScaleUp controls how many nodes can be added in a single scale-up.
+	// Note that this is strictly a performance optimization aimed at limiting binpacking time, not a tool to rate-limit
+	// scale-up. There is nothing stopping CA from adding MaxNodesPerScaleUp every loop.
+	MaxNodesPerScaleUp int
+	// MaxNodeGroupBinpackingDuration is a maximum time that can be spent binpacking a single NodeGroup. If the threshold
+	// is exceeded binpacking will be cut short and a partial scale-up will be performed.
+	MaxNodeGroupBinpackingDuration time.Duration
 }
diff --git a/cluster-autoscaler/core/autoscaler.go b/cluster-autoscaler/core/autoscaler.go
index 3735baaac083..178bc34fdf33 100644
--- a/cluster-autoscaler/core/autoscaler.go
+++ b/cluster-autoscaler/core/autoscaler.go
@@ -109,7 +109,7 @@ func initializeDefaultOptions(opts *AutoscalerOptions) error {
 		opts.ExpanderStrategy = expanderStrategy
 	}
 	if opts.EstimatorBuilder == nil {
-		estimatorBuilder, err := estimator.NewEstimatorBuilder(opts.EstimatorName)
+		estimatorBuilder, err := estimator.NewEstimatorBuilder(opts.EstimatorName, estimator.NewThresholdBasedEstimationLimiter(opts.MaxNodesPerScaleUp, opts.MaxNodeGroupBinpackingDuration))
 		if err != nil {
 			return err
 		}
diff --git a/cluster-autoscaler/core/scale_test_common.go b/cluster-autoscaler/core/scale_test_common.go
index 5d4832584890..310f6ec38c82 100644
--- a/cluster-autoscaler/core/scale_test_common.go
+++ b/cluster-autoscaler/core/scale_test_common.go
@@ -161,7 +161,7 @@ func NewScaleTestAutoscalingContext(
 	}
 	// Ignoring error here is safe - if a test doesn't specify valid estimatorName,
 	// it either doesn't need one, or should fail when it turns out to be nil.
-	estimatorBuilder, _ := estimator.NewEstimatorBuilder(options.EstimatorName)
+	estimatorBuilder, _ := estimator.NewEstimatorBuilder(options.EstimatorName, estimator.NewThresholdBasedEstimationLimiter(0, 0))
 	predicateChecker, err := simulator.NewTestPredicateChecker()
 	if err != nil {
 		return context.AutoscalingContext{}, err
diff --git a/cluster-autoscaler/core/scale_up.go b/cluster-autoscaler/core/scale_up.go
index 5cfb9339618c..9216bd627650 100644
--- a/cluster-autoscaler/core/scale_up.go
+++ b/cluster-autoscaler/core/scale_up.go
@@ -312,7 +312,7 @@ func computeExpansionOption(context *context.AutoscalingContext, podEquivalenceG

 	if len(option.Pods) > 0 {
 		estimator := context.EstimatorBuilder(context.PredicateChecker, context.ClusterSnapshot)
-		option.NodeCount = estimator.Estimate(option.Pods, nodeInfo)
+		option.NodeCount, option.Pods = estimator.Estimate(option.Pods, nodeInfo, option.NodeGroup)
 	}

 	return option, nil
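For orientation, the wiring introduced above can be summarized in one hedged sketch. Only NewEstimatorBuilder, NewThresholdBasedEstimationLimiter, BinpackingEstimatorName and the new Estimate signature come from this patch; the helper name and its arguments are hypothetical, and the usual cluster-autoscaler imports (apiv1, cloudprovider, estimator, simulator, schedulerframework, time) are assumed.

    // buildAndEstimate is a hypothetical helper illustrating the new call contract; not part of the patch.
    func buildAndEstimate(
        checker simulator.PredicateChecker,
        snapshot simulator.ClusterSnapshot,
        pendingPods []*apiv1.Pod,
        nodeTemplate *schedulerframework.NodeInfo,
        nodeGroup cloudprovider.NodeGroup,
    ) (int, []*apiv1.Pod, error) {
        // Thresholds mirror MaxNodesPerScaleUp / MaxNodeGroupBinpackingDuration; 0 disables a threshold.
        limiter := estimator.NewThresholdBasedEstimationLimiter(1000, 10*time.Second)
        builder, err := estimator.NewEstimatorBuilder(estimator.BinpackingEstimatorName, limiter)
        if err != nil {
            return 0, nil, err
        }
        est := builder(checker, snapshot)
        // Estimate now returns both the node count and the pods it actually managed to place,
        // so a partial scale-up only accounts for the pods that would really be helped.
        nodeCount, scheduledPods := est.Estimate(pendingPods, nodeTemplate, nodeGroup)
        return nodeCount, scheduledPods, nil
    }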
diff --git a/cluster-autoscaler/estimator/binpacking_estimator.go b/cluster-autoscaler/estimator/binpacking_estimator.go
index 87482f4921f1..5e6134a56d53 100644
--- a/cluster-autoscaler/estimator/binpacking_estimator.go
+++ b/cluster-autoscaler/estimator/binpacking_estimator.go
@@ -22,6 +22,7 @@ import (

 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/scheduler"
 	klog "k8s.io/klog/v2"
@@ -38,15 +39,18 @@ type podInfo struct {
 type BinpackingNodeEstimator struct {
 	predicateChecker simulator.PredicateChecker
 	clusterSnapshot  simulator.ClusterSnapshot
+	limiter          EstimationLimiter
 }

 // NewBinpackingNodeEstimator builds a new BinpackingNodeEstimator.
 func NewBinpackingNodeEstimator(
 	predicateChecker simulator.PredicateChecker,
-	clusterSnapshot simulator.ClusterSnapshot) *BinpackingNodeEstimator {
+	clusterSnapshot simulator.ClusterSnapshot,
+	limiter EstimationLimiter) *BinpackingNodeEstimator {
 	return &BinpackingNodeEstimator{
 		predicateChecker: predicateChecker,
 		clusterSnapshot:  clusterSnapshot,
+		limiter:          limiter,
 	}
 }

@@ -57,69 +61,102 @@ func NewBinpackingNodeEstimator(
 // still be maintained.
 // It is assumed that all pods from the given list can fit to nodeTemplate.
 // Returns the number of nodes needed to accommodate all pods from the list.
-func (estimator *BinpackingNodeEstimator) Estimate(
+func (e *BinpackingNodeEstimator) Estimate(
 	pods []*apiv1.Pod,
-	nodeTemplate *schedulerframework.NodeInfo) int {
+	nodeTemplate *schedulerframework.NodeInfo,
+	nodeGroup cloudprovider.NodeGroup) (int, []*apiv1.Pod) {
+
+	e.limiter.StartEstimation(pods, nodeGroup)
+	defer e.limiter.EndEstimation()
+
 	podInfos := calculatePodScore(pods, nodeTemplate)
 	sort.Slice(podInfos, func(i, j int) bool { return podInfos[i].score > podInfos[j].score })

 	newNodeNames := make(map[string]bool)
+	newNodesWithPods := make(map[string]bool)

-	if err := estimator.clusterSnapshot.Fork(); err != nil {
+	if err := e.clusterSnapshot.Fork(); err != nil {
 		klog.Errorf("Error while calling ClusterSnapshot.Fork; %v", err)
-		return 0
+		return 0, nil
 	}
 	defer func() {
-		if err := estimator.clusterSnapshot.Revert(); err != nil {
+		if err := e.clusterSnapshot.Revert(); err != nil {
 			klog.Fatalf("Error while calling ClusterSnapshot.Revert; %v", err)
 		}
 	}()

 	newNodeNameIndex := 0
+	scheduledPods := []*apiv1.Pod{}
+	lastNodeName := ""

 	for _, podInfo := range podInfos {
 		found := false

-		nodeName, err := estimator.predicateChecker.FitsAnyNodeMatching(estimator.clusterSnapshot, podInfo.pod, func(nodeInfo *schedulerframework.NodeInfo) bool {
+		nodeName, err := e.predicateChecker.FitsAnyNodeMatching(e.clusterSnapshot, podInfo.pod, func(nodeInfo *schedulerframework.NodeInfo) bool {
 			return newNodeNames[nodeInfo.Node().Name]
 		})
 		if err == nil {
 			found = true
-			if err := estimator.clusterSnapshot.AddPod(podInfo.pod, nodeName); err != nil {
+			if err := e.clusterSnapshot.AddPod(podInfo.pod, nodeName); err != nil {
 				klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", podInfo.pod.Namespace, podInfo.pod.Name, nodeName, err)
-				return 0
+				return 0, nil
 			}
+			scheduledPods = append(scheduledPods, podInfo.pod)
+			newNodesWithPods[nodeName] = true
 		}

 		if !found {
+			// Stop binpacking if we reach the limit of nodes we can add.
+			// We return the result of the binpacking that we already performed.
+			if !e.limiter.PermissionToAddNode() {
+				break
+			}
+
+			// If the last node we've added is empty and the pod couldn't schedule on it, it wouldn't be able to schedule
+			// on a new node either. There is no point adding more nodes to snapshot in such case, especially because of
+			// performance cost each extra node adds to future FitsAnyNodeMatching calls.
+			if lastNodeName != "" && !newNodesWithPods[lastNodeName] {
+				continue
+			}
+
 			// Add new node
-			newNodeName, err := estimator.addNewNodeToSnapshot(nodeTemplate, newNodeNameIndex)
+			newNodeName, err := e.addNewNodeToSnapshot(nodeTemplate, newNodeNameIndex)
 			if err != nil {
 				klog.Errorf("Error while adding new node for template to ClusterSnapshot; %v", err)
-				return 0
+				return 0, nil
 			}
 			newNodeNameIndex++
-			// And schedule pod to it
-			if err := estimator.clusterSnapshot.AddPod(podInfo.pod, newNodeName); err != nil {
+			newNodeNames[newNodeName] = true
+			lastNodeName = newNodeName
+
+			// And try to schedule pod to it.
+			// Note that this may still fail (ex. if topology spreading with zonal topologyKey is used);
+			// in this case we can't help the pending pod. We keep the node in clusterSnapshot to avoid
+			// adding and removing node to snapshot for each such pod.
+			if err := e.predicateChecker.CheckPredicates(e.clusterSnapshot, podInfo.pod, newNodeName); err != nil {
+				continue
+			}
+			if err := e.clusterSnapshot.AddPod(podInfo.pod, newNodeName); err != nil {
 				klog.Errorf("Error adding pod %v.%v to node %v in ClusterSnapshot; %v", podInfo.pod.Namespace, podInfo.pod.Name, newNodeName, err)
-				return 0
+				return 0, nil
 			}
-			newNodeNames[newNodeName] = true
+			newNodesWithPods[newNodeName] = true
+			scheduledPods = append(scheduledPods, podInfo.pod)
 		}
 	}
-	return len(newNodeNames)
+	return len(newNodesWithPods), scheduledPods
 }

-func (estimator *BinpackingNodeEstimator) addNewNodeToSnapshot(
+func (e *BinpackingNodeEstimator) addNewNodeToSnapshot(
 	template *schedulerframework.NodeInfo,
 	nameIndex int) (string, error) {

-	newNodeInfo := scheduler.DeepCopyTemplateNode(template, fmt.Sprintf("estimator-%d", nameIndex))
+	newNodeInfo := scheduler.DeepCopyTemplateNode(template, fmt.Sprintf("e-%d", nameIndex))
 	var pods []*apiv1.Pod
 	for _, podInfo := range newNodeInfo.Pods {
 		pods = append(pods, podInfo.Pod)
 	}
-	if err := estimator.clusterSnapshot.AddNodeWithPods(newNodeInfo.Node(), pods); err != nil {
+	if err := e.clusterSnapshot.AddNodeWithPods(newNodeInfo.Node(), pods); err != nil {
 		return "", err
 	}
 	return newNodeInfo.Node().Name, nil
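The new control flow is easier to see outside of the cluster-autoscaler types. Below is a hedged, self-contained sketch of the same idea: first-fit binpacking that asks a limiter before adding a node, and skips pods once the last added node stays empty. The pod/node structs and the fits() check are simplified stand-ins, not cluster-autoscaler APIs, and the numbers in main() are arbitrary.

    // Simplified illustration of limited binpacking; not cluster-autoscaler code.
    package main

    import "fmt"

    type pod struct{ cpu int }
    type node struct{ free int }

    func estimate(pods []pod, nodeCapacity, maxNodes int) (int, []pod) {
        var nodes []*node
        var scheduled []pod
        nodesWithPods := 0
        lastNodeGotPod := true
        fits := func(n *node, p pod) bool { return n.free >= p.cpu }

        for _, p := range pods {
            placed := false
            for _, n := range nodes {
                if fits(n, p) { // first try the nodes the simulation already added
                    n.free -= p.cpu
                    scheduled = append(scheduled, p)
                    placed = true
                    break
                }
            }
            if placed {
                continue
            }
            // Ask the "limiter" for permission before adding yet another simulated node.
            if maxNodes > 0 && len(nodes) >= maxNodes {
                break // permission denied: return the partial result computed so far
            }
            // If the previously added node is still empty, a fresh node won't help either.
            if len(nodes) > 0 && !lastNodeGotPod {
                continue
            }
            n := &node{free: nodeCapacity}
            nodes = append(nodes, n)
            if !fits(n, p) {
                lastNodeGotPod = false // pod can't run even on an empty node; keep the node, skip the pod
                continue
            }
            n.free -= p.cpu
            scheduled = append(scheduled, p)
            nodesWithPods++
            lastNodeGotPod = true
        }
        return nodesWithPods, scheduled
    }

    func main() {
        // With room for two 500m pods per node and a one-node limit, only two of the
        // four pods are packed - mirroring the partial scale-up behaviour above.
        count, scheduled := estimate([]pod{{500}, {500}, {500}, {500}}, 1000, 1)
        fmt.Println(count, len(scheduled)) // prints: 1 2
    }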
diff --git a/cluster-autoscaler/estimator/binpacking_estimator_test.go b/cluster-autoscaler/estimator/binpacking_estimator_test.go
index 6aa4ee1b84ec..797d3d28e742 100644
--- a/cluster-autoscaler/estimator/binpacking_estimator_test.go
+++ b/cluster-autoscaler/estimator/binpacking_estimator_test.go
@@ -22,6 +22,7 @@ import (

 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
 	"k8s.io/autoscaler/cluster-autoscaler/utils/units"
@@ -30,89 +31,157 @@ import (
 	"github.com/stretchr/testify/assert"
 )

-func makePod(cpuPerPod, memoryPerPod int64) *apiv1.Pod {
-	return &apiv1.Pod{
+func makePods(cpuPerPod int64, memoryPerPod int64, hostport int32, maxSkew int32, topologySpreadingKey string, podCount int) []*apiv1.Pod {
+	pod := &apiv1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "estimatee",
+			Namespace: "universe",
+			Labels: map[string]string{
+				"app": "estimatee",
+			},
+		},
 		Spec: apiv1.PodSpec{
 			Containers: []apiv1.Container{
 				{
 					Resources: apiv1.ResourceRequirements{
 						Requests: apiv1.ResourceList{
 							apiv1.ResourceCPU:    *resource.NewMilliQuantity(cpuPerPod, resource.DecimalSI),
-							apiv1.ResourceMemory: *resource.NewQuantity(memoryPerPod, resource.DecimalSI),
+							apiv1.ResourceMemory: *resource.NewQuantity(memoryPerPod*units.MiB, resource.DecimalSI),
 						},
 					},
 				},
 			},
 		},
 	}
-}
-
-func TestBinpackingEstimate(t *testing.T) {
-	estimator := newBinPackingEstimator(t)
-
-	cpuPerPod := int64(350)
-	memoryPerPod := int64(1000 * units.MiB)
-	pod := makePod(cpuPerPod, memoryPerPod)
-
-	pods := make([]*apiv1.Pod, 0)
-	for i := 0; i < 10; i++ {
+	if hostport > 0 {
+		pod.Spec.Containers[0].Ports = []apiv1.ContainerPort{
+			{
+				HostPort: hostport,
+			},
+		}
+	}
+	if maxSkew > 0 {
+		pod.Spec.TopologySpreadConstraints = []apiv1.TopologySpreadConstraint{
+			{
+				MaxSkew:           maxSkew,
+				TopologyKey:       topologySpreadingKey,
+				WhenUnsatisfiable: "DoNotSchedule",
+				LabelSelector: &metav1.LabelSelector{
+					MatchLabels: map[string]string{
+						"app": "estimatee",
+					},
+				},
+			},
+		}
+	}
+	pods := []*apiv1.Pod{}
+	for i := 0; i < podCount; i++ {
 		pods = append(pods, pod)
 	}
+	return pods
+}
+
+func makeNode(cpu int64, mem int64, name string, zone string) *apiv1.Node {
 	node := &apiv1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+			Labels: map[string]string{
+				"kubernetes.io/hostname":      name,
+				"topology.kubernetes.io/zone": zone,
+			},
+		},
 		Status: apiv1.NodeStatus{
 			Capacity: apiv1.ResourceList{
-				apiv1.ResourceCPU:    *resource.NewMilliQuantity(cpuPerPod*3-50, resource.DecimalSI),
-				apiv1.ResourceMemory: *resource.NewQuantity(2*memoryPerPod, resource.DecimalSI),
+				apiv1.ResourceCPU:    *resource.NewMilliQuantity(cpu, resource.DecimalSI),
+				apiv1.ResourceMemory: *resource.NewQuantity(mem*units.MiB, resource.DecimalSI),
 				apiv1.ResourcePods:   *resource.NewQuantity(10, resource.DecimalSI),
 			},
 		},
 	}
 	node.Status.Allocatable = node.Status.Capacity
 	SetNodeReadyState(node, true, time.Time{})
-
-	nodeInfo := schedulerframework.NewNodeInfo()
-	nodeInfo.SetNode(node)
-	estimate := estimator.Estimate(pods, nodeInfo)
-	assert.Equal(t, 5, estimate)
+	return node
 }

-func TestBinpackingEstimateWithPorts(t *testing.T) {
-	estimator := newBinPackingEstimator(t)
-
-	cpuPerPod := int64(200)
-	memoryPerPod := int64(1000 * units.MiB)
-	pod := makePod(cpuPerPod, memoryPerPod)
-	pod.Spec.Containers[0].Ports = []apiv1.ContainerPort{
+func TestBinpackingEstimate(t *testing.T) {
+	testCases := []struct {
+		name                 string
+		millicores           int64
+		memory               int64
+		maxNodes             int
+		pods                 []*apiv1.Pod
+		topologySpreadingKey string
+		expectNodeCount      int
+		expectPodCount       int
+	}{
 		{
-			HostPort: 5555,
+			name:            "simple resource-based binpacking",
+			millicores:      350*3 - 50,
+			memory:          2 * 1000,
+			pods:            makePods(350, 1000, 0, 0, "", 10),
+			expectNodeCount: 5,
+			expectPodCount:  10,
 		},
-	}
-	pods := make([]*apiv1.Pod, 0)
-	for i := 0; i < 8; i++ {
-		pods = append(pods, pod)
-	}
-	node := &apiv1.Node{
-		Status: apiv1.NodeStatus{
-			Capacity: apiv1.ResourceList{
-				apiv1.ResourceCPU:    *resource.NewMilliQuantity(5*cpuPerPod, resource.DecimalSI),
-				apiv1.ResourceMemory: *resource.NewQuantity(5*memoryPerPod, resource.DecimalSI),
-				apiv1.ResourcePods:   *resource.NewQuantity(10, resource.DecimalSI),
-			},
+		{
+			name:            "pods-per-node bound binpacking",
+			millicores:      10000,
+			memory:          20000,
+			pods:            makePods(10, 100, 0, 0, "", 20),
+			expectNodeCount: 2,
+			expectPodCount:  20,
+		},
+		{
+			name:            "hostport conflict forces pod-per-node",
+			millicores:      1000,
+			memory:          5000,
+			pods:            makePods(200, 1000, 5555, 0, "", 8),
+			expectNodeCount: 8,
+			expectPodCount:  8,
+		},
+		{
+			name:            "limiter cuts binpacking",
+			millicores:      1000,
+			memory:          5000,
+			pods:            makePods(500, 1000, 0, 0, "", 20),
+			maxNodes:        5,
+			expectNodeCount: 5,
+			expectPodCount:  10,
+		},
+		{
+			name:            "hostname topology spreading with maxSkew=2 forces 2 pods/node",
+			millicores:      1000,
+			memory:          5000,
+			pods:            makePods(20, 100, 0, 2, "kubernetes.io/hostname", 8),
+			expectNodeCount: 4,
+			expectPodCount:  8,
+		},
+		{
+			name:            "zonal topology spreading with maxSkew=2 only allows 2 pods to schedule",
+			millicores:      1000,
+			memory:          5000,
+			pods:            makePods(20, 100, 0, 2, "topology.kubernetes.io/zone", 8),
+			expectNodeCount: 1,
+			expectPodCount:  2,
 		},
 	}
-	node.Status.Allocatable = node.Status.Capacity
-	SetNodeReadyState(node, true, time.Time{})
-
-	nodeInfo := schedulerframework.NewNodeInfo()
-	nodeInfo.SetNode(node)
-	estimate := estimator.Estimate(pods, nodeInfo)
-	assert.Equal(t, 8, estimate)
-}
-
-func newBinPackingEstimator(t *testing.T) *BinpackingNodeEstimator {
-	predicateChecker, err := simulator.NewTestPredicateChecker()
-	clusterSnapshot := simulator.NewBasicClusterSnapshot()
-	assert.NoError(t, err)
-	estimator := NewBinpackingNodeEstimator(predicateChecker, clusterSnapshot)
-	return estimator
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			clusterSnapshot := simulator.NewBasicClusterSnapshot()
+			// Add one node in different zone to trigger topology spread constraints
+			clusterSnapshot.AddNode(makeNode(100, 100, "oldnode", "zone-jupiter"))
+
+			predicateChecker, err := simulator.NewTestPredicateChecker()
+			assert.NoError(t, err)
+			limiter := NewThresholdBasedEstimationLimiter(tc.maxNodes, time.Duration(0))
+			estimator := NewBinpackingNodeEstimator(predicateChecker, clusterSnapshot, limiter)
+
+			node := makeNode(tc.millicores, tc.memory, "template", "zone-mars")
+			nodeInfo := schedulerframework.NewNodeInfo()
+			nodeInfo.SetNode(node)
+
+			estimatedNodes, estimatedPods := estimator.Estimate(tc.pods, nodeInfo, nil)
+			assert.Equal(t, tc.expectNodeCount, estimatedNodes)
+			assert.Equal(t, tc.expectPodCount, len(estimatedPods))
+		})
+	}
 }
diff --git a/cluster-autoscaler/estimator/estimator.go b/cluster-autoscaler/estimator/estimator.go
index 8a86ec9c66b5..7d4f819dcf19 100644
--- a/cluster-autoscaler/estimator/estimator.go
+++ b/cluster-autoscaler/estimator/estimator.go
@@ -20,6 +20,7 @@ import (
 	"fmt"

 	apiv1 "k8s.io/api/core/v1"
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	"k8s.io/autoscaler/cluster-autoscaler/simulator"
 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
 )
@@ -33,22 +34,40 @@
 var AvailableEstimators = []string{BinpackingEstimatorName}

 // Estimator calculates the number of nodes of given type needed to schedule pods.
+// It returns the number of new nodes needed as well as the list of pods it managed
+// to schedule on those nodes.
 type Estimator interface {
-	Estimate([]*apiv1.Pod, *schedulerframework.NodeInfo) int
+	Estimate([]*apiv1.Pod, *schedulerframework.NodeInfo, cloudprovider.NodeGroup) (int, []*apiv1.Pod)
 }

 // EstimatorBuilder creates a new estimator object.
 type EstimatorBuilder func(simulator.PredicateChecker, simulator.ClusterSnapshot) Estimator

 // NewEstimatorBuilder creates a new estimator object from flag.
-func NewEstimatorBuilder(name string) (EstimatorBuilder, error) {
+func NewEstimatorBuilder(name string, limiter EstimationLimiter) (EstimatorBuilder, error) {
 	switch name {
 	case BinpackingEstimatorName:
 		return func(
 			predicateChecker simulator.PredicateChecker,
 			clusterSnapshot simulator.ClusterSnapshot) Estimator {
-			return NewBinpackingNodeEstimator(predicateChecker, clusterSnapshot)
+			return NewBinpackingNodeEstimator(predicateChecker, clusterSnapshot, limiter)
 		}, nil
 	}
 	return nil, fmt.Errorf("unknown estimator: %s", name)
 }
+
+// EstimationLimiter controls how many nodes can be added by Estimator.
+// A limiter can be used to prevent costly estimation if an actual ability to
+// scale-up is limited by external factors.
+type EstimationLimiter interface {
+	// StartEstimation is called at the start of estimation.
+	StartEstimation([]*apiv1.Pod, cloudprovider.NodeGroup)
+	// EndEstimation is called at the end of estimation.
+	EndEstimation()
+	// PermissionToAddNode is called by an estimator when it wants to add additional
+	// nodes to simulation. If permission is not granted the Estimator is expected
+	// not to add any more nodes in this simulation.
+	// There is no requirement for the Estimator to stop calculations, it's
+	// just not expected to add any more nodes.
+	PermissionToAddNode() bool
+}
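Because the EstimationLimiter contract above is intentionally small, alternative limiters are easy to write. As a hedged illustration only (this type does not exist in the patch), a limiter that never restricts estimation could live in the estimator package like this, assuming the same apiv1 and cloudprovider imports:

    // unlimitedEstimationLimiter is a hypothetical EstimationLimiter that never limits estimation.
    type unlimitedEstimationLimiter struct{}

    // StartEstimation is called before binpacking a NodeGroup; nothing to initialize here.
    func (*unlimitedEstimationLimiter) StartEstimation([]*apiv1.Pod, cloudprovider.NodeGroup) {}

    // EndEstimation is called after binpacking finishes; nothing to clean up.
    func (*unlimitedEstimationLimiter) EndEstimation() {}

    // PermissionToAddNode always lets the estimator add another simulated node.
    func (*unlimitedEstimationLimiter) PermissionToAddNode() bool { return true }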
diff --git a/cluster-autoscaler/estimator/threshold_based_limiter.go b/cluster-autoscaler/estimator/threshold_based_limiter.go
new file mode 100644
index 000000000000..4381721d0c53
--- /dev/null
+++ b/cluster-autoscaler/estimator/threshold_based_limiter.go
@@ -0,0 +1,64 @@
+/*
+Copyright 2022 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package estimator
+
+import (
+	"time"
+
+	apiv1 "k8s.io/api/core/v1"
+	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
+	klog "k8s.io/klog/v2"
+)
+
+type thresholdBasedEstimationLimiter struct {
+	maxDuration time.Duration
+	maxNodes    int
+	nodes       int
+	start       time.Time
+}
+
+func (tbel *thresholdBasedEstimationLimiter) StartEstimation([]*apiv1.Pod, cloudprovider.NodeGroup) {
+	tbel.start = time.Now()
+	tbel.nodes = 0
+}
+
+func (*thresholdBasedEstimationLimiter) EndEstimation() {}
+
+func (tbel *thresholdBasedEstimationLimiter) PermissionToAddNode() bool {
+	if tbel.maxNodes > 0 && tbel.nodes >= tbel.maxNodes {
+		klog.V(4).Infof("Capping binpacking after exceeding threshold of %d nodes", tbel.maxNodes)
+		return false
+	}
+	timeDefined := tbel.maxDuration > 0 && tbel.start != time.Time{}
+	if timeDefined && time.Now().After(tbel.start.Add(tbel.maxDuration)) {
+		klog.V(4).Infof("Capping binpacking after exceeding max duration of %v", tbel.maxDuration)
+		return false
+	}
+	tbel.nodes++
+	return true
+}
+
+// NewThresholdBasedEstimationLimiter returns an EstimationLimiter that will prevent estimation
+// after either a node count- or time-based threshold is reached. This is meant to prevent cases
+// where binpacking of hundreds or thousands of nodes takes extremely long time rendering CA
+// incredibly slow or even completely crashing it.
+func NewThresholdBasedEstimationLimiter(maxNodes int, maxDuration time.Duration) EstimationLimiter {
+	return &thresholdBasedEstimationLimiter{
+		maxNodes:    maxNodes,
+		maxDuration: maxDuration,
+	}
+}
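A hedged usage sketch of the limiter above, as it might appear inside a test in the estimator package (not part of the patch): with maxNodes=2 and no time threshold, the third permission request within a single estimation is denied, and a threshold of 0 simply disables that check.

    limiter := NewThresholdBasedEstimationLimiter(2, 0)
    limiter.StartEstimation([]*apiv1.Pod{}, nil)
    _ = limiter.PermissionToAddNode() // true  - first simulated node
    _ = limiter.PermissionToAddNode() // true  - second simulated node
    _ = limiter.PermissionToAddNode() // false - node threshold reached
    limiter.EndEstimation()
    // The next StartEstimation resets both the node counter and the timer for the next NodeGroup.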
diff --git a/cluster-autoscaler/estimator/threshold_based_limiter_test.go b/cluster-autoscaler/estimator/threshold_based_limiter_test.go
new file mode 100644
index 000000000000..e80b586f3ebb
--- /dev/null
+++ b/cluster-autoscaler/estimator/threshold_based_limiter_test.go
@@ -0,0 +1,129 @@
+/*
+Copyright 2016 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package estimator
+
+import (
+	"testing"
+	"time"
+
+	apiv1 "k8s.io/api/core/v1"
+
+	"github.com/stretchr/testify/assert"
+)
+
+type limiterOperation func(*testing.T, EstimationLimiter)
+
+func expectDeny(t *testing.T, l EstimationLimiter) {
+	assert.Equal(t, false, l.PermissionToAddNode())
+}
+
+func expectAllow(t *testing.T, l EstimationLimiter) {
+	assert.Equal(t, true, l.PermissionToAddNode())
+}
+
+func resetLimiter(t *testing.T, l EstimationLimiter) {
+	l.EndEstimation()
+	l.StartEstimation([]*apiv1.Pod{}, nil)
+}
+
+func TestThresholdBasedLimiter(t *testing.T) {
+	testCases := []struct {
+		name            string
+		maxNodes        int
+		maxDuration     time.Duration
+		startDelta      time.Duration
+		operations      []limiterOperation
+		expectNodeCount int
+	}{
+		{
+			name:     "no limiting happens",
+			maxNodes: 20,
+			operations: []limiterOperation{
+				expectAllow,
+				expectAllow,
+				expectAllow,
+			},
+			expectNodeCount: 3,
+		},
+		{
+			name:        "time based trigger fires",
+			maxNodes:    20,
+			maxDuration: 5 * time.Second,
+			startDelta:  -10 * time.Second,
+			operations: []limiterOperation{
+				expectDeny,
+				expectDeny,
+			},
+			expectNodeCount: 0,
+		},
+		{
+			name:     "sequence of additions works until the threshold is hit",
+			maxNodes: 3,
+			operations: []limiterOperation{
+				expectAllow,
+				expectAllow,
+				expectAllow,
+				expectDeny,
+			},
+			expectNodeCount: 3,
+		},
+		{
+			name:     "node counter is reset",
+			maxNodes: 2,
+			operations: []limiterOperation{
+				expectAllow,
+				expectAllow,
+				expectDeny,
+				resetLimiter,
+				expectAllow,
+			},
+			expectNodeCount: 1,
+		},
+		{
+			name:        "timer is reset",
+			maxNodes:    20,
+			maxDuration: 5 * time.Second,
+			startDelta:  -10 * time.Second,
+			operations: []limiterOperation{
+				expectDeny,
+				resetLimiter,
+				expectAllow,
+				expectAllow,
+			},
+			expectNodeCount: 2,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			limiter := &thresholdBasedEstimationLimiter{
+				maxNodes:    tc.maxNodes,
+				maxDuration: tc.maxDuration,
+			}
+			limiter.StartEstimation([]*apiv1.Pod{}, nil)
+
+			if tc.startDelta != time.Duration(0) {
+				limiter.start = limiter.start.Add(tc.startDelta)
+			}
+
+			for _, op := range tc.operations {
+				op(t, limiter)
+			}
+			assert.Equal(t, tc.expectNodeCount, limiter.nodes)
+			limiter.EndEstimation()
+		})
+	}
+}
diff --git a/cluster-autoscaler/main.go b/cluster-autoscaler/main.go
index 4a77d3febb40..1bcbcd5cc8a2 100644
--- a/cluster-autoscaler/main.go
+++ b/cluster-autoscaler/main.go
@@ -182,7 +182,19 @@ var (
 	daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true,
 		"DaemonSet pods will be gracefully terminated from non-empty nodes")
 	userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")
-	emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
+	emitPerNodeGroupMetrics            = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
+	initialNodeGroupBackoffDuration    = flag.Duration("initial-node-group-backoff-duration", 5*time.Minute,
+		"initialNodeGroupBackoffDuration is the duration of first backoff after a new node failed to start.")
+	maxNodeGroupBackoffDuration = flag.Duration("max-node-group-backoff-duration", 30*time.Minute,
+		"maxNodeGroupBackoffDuration is the maximum backoff duration for a NodeGroup after new nodes failed to start.")
+	nodeGroupBackoffResetTimeout = flag.Duration("node-group-backoff-reset-timeout", 3*time.Hour,
+		"nodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.")
+	maxScaleDownParallelismFlag        = flag.Int("max-scale-down-parallelism", 10, "Maximum number of nodes (both empty and needing drain) that can be deleted in parallel.")
+	maxDrainParallelismFlag            = flag.Int("max-drain-parallelism", 1, "Maximum number of nodes needing drain, that can be drained and deleted in parallel.")
+	gceExpanderEphemeralStorageSupport = flag.Bool("gce-expander-ephemeral-storage-support", false, "Whether scale-up takes ephemeral storage resources into account for GCE cloud provider")
+	recordDuplicatedEvents             = flag.Bool("record-duplicated-events", false, "enable duplication of similar events within a 5 minute window.")
+	maxNodesPerScaleUp                 = flag.Int("max-nodes-per-scaleup", 1000, "Max nodes added in a single scale-up. This is intended strictly for optimizing CA algorithm latency and not a tool to rate-limit scale-up throughput.")
+	maxNodeGroupBinpackingDuration     = flag.Duration("max-nodegroup-binpacking-duration", 10*time.Second, "Maximum time that will be spent in binpacking simulation for each NodeGroup.")
 )

 func createAutoscalingOptions() config.AutoscalingOptions {
@@ -260,6 +272,15 @@
 		DaemonSetEvictionForEmptyNodes:     *daemonSetEvictionForEmptyNodes,
 		DaemonSetEvictionForOccupiedNodes:  *daemonSetEvictionForOccupiedNodes,
 		UserAgent:                          *userAgent,
+		InitialNodeGroupBackoffDuration:    *initialNodeGroupBackoffDuration,
+		MaxNodeGroupBackoffDuration:        *maxNodeGroupBackoffDuration,
+		NodeGroupBackoffResetTimeout:       *nodeGroupBackoffResetTimeout,
+		MaxScaleDownParallelism:            *maxScaleDownParallelismFlag,
+		MaxDrainParallelism:                *maxDrainParallelismFlag,
+		GceExpanderEphemeralStorageSupport: *gceExpanderEphemeralStorageSupport,
+		RecordDuplicatedEvents:             *recordDuplicatedEvents,
+		MaxNodesPerScaleUp:                 *maxNodesPerScaleUp,
+		MaxNodeGroupBinpackingDuration:     *maxNodeGroupBinpackingDuration,
 	}
 }
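For illustration only: the new limits are ordinary command-line flags on the cluster-autoscaler binary. The flag names come straight from the definitions above; the values here are arbitrary examples, not recommended defaults.

    cluster-autoscaler \
        --max-nodes-per-scaleup=500 \
        --max-nodegroup-binpacking-duration=5s \
        --node-group-backoff-reset-timeout=1h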