
Commit

Merge pull request #6477 from kushagra98/atomic-nodes
Consider atomic nodes
k8s-ci-robot authored Jan 31, 2024
2 parents ed6ebbe + a5502e3 commit 7e95c7e
Showing 2 changed files with 107 additions and 2 deletions.
28 changes: 26 additions & 2 deletions cluster-autoscaler/core/scaledown/planner/planner.go
@@ -23,6 +23,7 @@ import (

apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown"
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility"
@@ -261,6 +262,7 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
unremovableTimeout := p.latestUpdate.Add(p.context.AutoscalingOptions.UnremovableNodeRecheckTimeout)
unremovableCount := 0
var removableList []simulator.NodeToBeRemoved
atomicScaleDownNodesCount := 0
p.unremovableNodes.Update(p.context.ClusterSnapshot.NodeInfos(), p.latestUpdate)
currentlyUnneededNodeNames, utilizationMap, ineligible := p.eligibilityChecker.FilterOutUnremovable(p.context, scaleDownCandidates, p.latestUpdate, p.unremovableNodes)
for _, n := range ineligible {
@@ -274,8 +276,8 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
break
}
if len(removableList) >= p.unneededNodesLimit() {
klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList))
if len(removableList)-atomicScaleDownNodesCount >= p.unneededNodesLimit() {
klog.V(4).Infof("%d out of %d nodes skipped in scale down simulation: there are already %d unneeded nodes so no point in looking for more. Total atomic scale down nodes: %d", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames), len(removableList), atomicScaleDownNodesCount)
break
}
removable, unremovable := p.rs.SimulateNodeRemoval(node, podDestinations, p.latestUpdate, p.context.RemainingPdbTracker)
@@ -287,6 +289,10 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
delete(podDestinations, removable.Node.Name)
p.context.RemainingPdbTracker.RemovePods(removable.PodsToReschedule)
removableList = append(removableList, *removable)
if p.atomicScaleDownNode(removable) {
atomicScaleDownNodesCount++
klog.V(2).Infof("Considering node %s for atomic scale down. Total atomic scale down nodes count: %d", removable.Node.Name, atomicScaleDownNodesCount)
}
}
if unremovable != nil {
unremovableCount += 1
@@ -299,6 +305,24 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
}
}

// atomicScaleDownNode checks if the removable node would be considered for atomic scale down.
func (p *Planner) atomicScaleDownNode(node *simulator.NodeToBeRemoved) bool {
nodeGroup, err := p.context.CloudProvider.NodeGroupForNode(node.Node)
if err != nil {
klog.Errorf("failed to get node info for %v: %s", node.Node.Name, err)
return false
}
autoscalingOptions, err := nodeGroup.GetOptions(p.context.NodeGroupDefaults)
if err != nil && err != cloudprovider.ErrNotImplemented {
klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
return false
}
if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
return true
}
return false
}

// unneededNodesLimit returns the number of nodes after which calculating more
// unneeded nodes is a waste of time. The reasoning behind it is essentially as
// follows.
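The core of the planner.go change is the adjusted limit check above: nodes that belong to node groups with ZeroOrMaxNodeScaling enabled no longer count against the unneeded-nodes limit. Below is a minimal, self-contained sketch of that arithmetic; the concrete numbers are hypothetical, and the real limit comes from p.unneededNodesLimit().

package main

import "fmt"

func main() {
	// Hypothetical values for illustration only.
	limit := 30      // stand-in for p.unneededNodesLimit()
	removable := 100 // nodes found removable so far in the simulation loop
	atomic := 100    // of those, nodes in ZeroOrMaxNodeScaling node groups

	// Before this commit the check was simply removable >= limit, which
	// capped the unneeded list regardless of node group type. Subtracting
	// the atomic count lets an entire ZeroOrMaxNodeScaling group be marked
	// unneeded, as the wantUnneeded: 100 test cases below expect.
	if removable-atomic >= limit {
		fmt.Println("stop: enough non-atomic unneeded nodes already found")
	} else {
		fmt.Println("continue: atomic nodes do not count against the limit")
	}
}
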
81 changes: 81 additions & 0 deletions cluster-autoscaler/core/scaledown/planner/planner_test.go
@@ -484,6 +484,10 @@ func TestUpdateClusterState(t *testing.T) {
assert.NoError(t, err)
registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, rsLister, nil)
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 0, 0, 0)
for _, node := range tc.nodes {
provider.AddNode("ng1", node)
}
context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{
NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
ScaleDownUnneededTime: 10 * time.Minute,
@@ -521,6 +525,7 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
name string
previouslyUnneeded int
nodes int
opts *config.NodeGroupAutoscalingOptions
maxParallelism int
maxUnneededTime time.Duration
updateInterval time.Duration
@@ -589,6 +594,78 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
updateInterval: 30 * time.Second,
wantUnneeded: 30,
},
{
name: "atomic scale down - default settings",
previouslyUnneeded: 5,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 10 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - quick loops",
previouslyUnneeded: 5,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 1 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - slow loops",
previouslyUnneeded: 5,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 30 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - too many unneeded",
previouslyUnneeded: 77,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 30 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - no unneeded",
previouslyUnneeded: 0,
nodes: 100,
maxParallelism: 10,
maxUnneededTime: 1 * time.Minute,
updateInterval: 30 * time.Second,
wantUnneeded: 100,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
{
name: "atomic scale down - short unneeded time and short update interval",
previouslyUnneeded: 0,
nodes: 500,
maxParallelism: 1,
maxUnneededTime: 1 * time.Second,
updateInterval: 1 * time.Second,
wantUnneeded: 500,
opts: &config.NodeGroupAutoscalingOptions{
ZeroOrMaxNodeScaling: true,
},
},
}
for _, tc := range testCases {
tc := tc
@@ -603,6 +680,10 @@ func TestUpdateClusterStatUnneededNodesLimit(t *testing.T) {
previouslyUnneeded[i] = simulator.NodeToBeRemoved{Node: nodes[i]}
}
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroupWithCustomOptions("ng1", 0, 0, 0, tc.opts)
for _, node := range nodes {
provider.AddNode("ng1", node)
}
context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{
NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
ScaleDownUnneededTime: tc.maxUnneededTime,
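Pulling the two test-setup additions together: both tests now register every node in a single node group, and the limit test additionally attaches per-group autoscaling options with ZeroOrMaxNodeScaling enabled. The sketch below shows that wiring as a standalone helper; the helper name is invented for illustration and the import paths are assumed from the repository layout.

package planner_test

import (
	apiv1 "k8s.io/api/core/v1"
	testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
	"k8s.io/autoscaler/cluster-autoscaler/config"
)

// newAtomicTestProvider mirrors the test wiring added above: every node is
// registered in one node group with ZeroOrMaxNodeScaling enabled, so the
// planner's atomicScaleDownNode check reports true for each of them.
func newAtomicTestProvider(nodes []*apiv1.Node) *testprovider.TestCloudProvider {
	opts := &config.NodeGroupAutoscalingOptions{
		ZeroOrMaxNodeScaling: true,
	}
	provider := testprovider.NewTestCloudProvider(nil, nil)
	provider.AddNodeGroupWithCustomOptions("ng1", 0, 0, 0, opts)
	for _, node := range nodes {
		provider.AddNode("ng1", node)
	}
	return provider
}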
