make MaxKubernetesEmptyNodeDeletionTime not configurable - just as upstream
gandhipr committed Feb 27, 2024
1 parent bfb24ba commit dabc007
Showing 6 changed files with 71 additions and 72 deletions.
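
In short, the commit stops reading the 3-minute Kubernetes empty-node deletion timeout from a flag-backed field on AutoscalingOptions and hardcodes it as a package-level constant, matching upstream. A minimal standalone Go sketch of that pattern, using a simplified placeholder struct rather than the real autoscaler types:

package main

import (
	"fmt"
	"time"
)

// Before: the timeout lived on the options struct and was populated from a flag.
type autoscalingOptions struct {
	MaxCloudProviderNodeDeletionTime time.Duration
	// MaxKubernetesEmptyNodeDeletionTime time.Duration  // removed: no longer configurable
}

// After: the timeout is a fixed package-level constant.
const maxKubernetesEmptyNodeDeletionTime = 3 * time.Minute

func main() {
	opts := autoscalingOptions{MaxCloudProviderNodeDeletionTime: 5 * time.Minute}
	fmt.Println("cloud provider node deletion timeout (flag-driven):", opts.MaxCloudProviderNodeDeletionTime)
	fmt.Println("kubernetes empty node deletion timeout (constant):", maxKubernetesEmptyNodeDeletionTime)
}
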
2 changes: 0 additions & 2 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -134,8 +134,6 @@ type AutoscalingOptions struct {
DrainPriorityConfig []kubelet_config.ShutdownGracePeriodByPodPriority
// MaxCloudProviderNodeDeletionTime is the maximum time needed by cloud provider to delete a node
MaxCloudProviderNodeDeletionTime time.Duration
- // MaxKubernetesEmptyNodeDeletionTime is the maximum time needed by Kubernetes to delete an empty node
- MaxKubernetesEmptyNodeDeletionTime time.Duration
// Maximum time CA waits for node to be provisioned
// MaxNodeProvisionTime defines maximum time CA waits for node to be provisioned
MaxNodeProvisionTime time.Duration
@@ -37,6 +37,11 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
)

+ const (
+ // MaxKubernetesEmptyNodeDeletionTime is the maximum time needed by Kubernetes to delete an empty node.
+ MaxKubernetesEmptyNodeDeletionTime = 3 * time.Minute
+ )
+
// NodeDeletionBatcher batch scale down candidates for one node group and remove them.
type NodeDeletionBatcher struct {
sync.Mutex
@@ -168,7 +173,7 @@ func nodeScaleDownReason(node *apiv1.Node, drain bool) metrics.NodeScaleDownReas
// IsNodeBeingDeleted returns true iff a given node is being deleted.
func IsNodeBeingDeleted(ac *context.AutoscalingContext, node *apiv1.Node, timestamp time.Time) bool {
deleteTime, _ := taints.GetToBeDeletedTime(node)
- return deleteTime != nil && (timestamp.Sub(*deleteTime) < ac.MaxCloudProviderNodeDeletionTime || timestamp.Sub(*deleteTime) < ac.MaxKubernetesEmptyNodeDeletionTime)
+ return deleteTime != nil && (timestamp.Sub(*deleteTime) < ac.MaxCloudProviderNodeDeletionTime || timestamp.Sub(*deleteTime) < MaxKubernetesEmptyNodeDeletionTime)
}

// CleanUpAndRecordFailedScaleDownEvent record failed scale down event and log an error.
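
For context, the check above treats a node as "being deleted" while its to-be-deleted taint timestamp is younger than either the configurable cloud-provider timeout or the now-fixed 3-minute constant. A simplified, self-contained sketch of that logic; the function below only mirrors its shape and does not use the real AutoscalingContext or taint helpers:

package main

import (
	"fmt"
	"time"
)

const maxKubernetesEmptyNodeDeletionTime = 3 * time.Minute

// isNodeBeingDeleted mirrors the updated check: deleteTime stands in for the
// timestamp parsed from the ToBeDeleted taint (nil if the taint is absent), and
// maxCloudProviderNodeDeletionTime is still taken from configuration.
func isNodeBeingDeleted(deleteTime *time.Time, timestamp time.Time, maxCloudProviderNodeDeletionTime time.Duration) bool {
	return deleteTime != nil &&
		(timestamp.Sub(*deleteTime) < maxCloudProviderNodeDeletionTime ||
			timestamp.Sub(*deleteTime) < maxKubernetesEmptyNodeDeletionTime)
}

func main() {
	now := time.Now()
	taintedTwoMinutesAgo := now.Add(-2 * time.Minute)
	taintedTenMinutesAgo := now.Add(-10 * time.Minute)

	fmt.Println(isNodeBeingDeleted(&taintedTwoMinutesAgo, now, 5*time.Minute)) // true: inside both windows
	fmt.Println(isNodeBeingDeleted(&taintedTenMinutesAgo, now, 5*time.Minute)) // false: both windows elapsed
	fmt.Println(isNodeBeingDeleted(nil, now, 5*time.Minute))                   // false: no deletion taint
}
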
@@ -164,8 +164,7 @@ func TestFilterOutUnremovable(t *testing.T) {
ScaleDownUnreadyTime: config.DefaultScaleDownUnreadyTime,
IgnoreDaemonSetsUtilization: tc.ignoreDaemonSetsUtilization,
},
- MaxCloudProviderNodeDeletionTime: 5 * time.Minute,
- MaxKubernetesEmptyNodeDeletionTime: 3 * time.Minute,
+ MaxCloudProviderNodeDeletionTime: 5 * time.Minute,
}
s := nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults)
c := NewChecker(s)
5 changes: 2 additions & 3 deletions cluster-autoscaler/core/scaledown/legacy/legacy_test.go
@@ -143,9 +143,8 @@ func TestFindUnneededNodes(t *testing.T) {
NodeGroupDefaults: config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.35,
},
- UnremovableNodeRecheckTimeout: 5 * time.Minute,
- MaxCloudProviderNodeDeletionTime: 5 * time.Minute,
- MaxKubernetesEmptyNodeDeletionTime: 3 * time.Minute,
+ UnremovableNodeRecheckTimeout: 5 * time.Minute,
+ MaxCloudProviderNodeDeletionTime: 5 * time.Minute,
}
context, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, registry, provider, nil, nil)
assert.NoError(t, err)
6 changes: 3 additions & 3 deletions cluster-autoscaler/core/scaledown/resource/limits_test.go
@@ -18,10 +18,11 @@ package resource

import (
"fmt"
"k8s.io/autoscaler/cluster-autoscaler/config"
"testing"
"time"

"k8s.io/autoscaler/cluster-autoscaler/config"

. "k8s.io/autoscaler/cluster-autoscaler/core/test"
"k8s.io/autoscaler/cluster-autoscaler/core/utils"
"k8s.io/autoscaler/cluster-autoscaler/utils/taints"
@@ -57,8 +58,7 @@ func TestCalculateCoresAndMemoryTotal(t *testing.T) {
}

options := config.AutoscalingOptions{
- MaxCloudProviderNodeDeletionTime: 5 * time.Minute,
- MaxKubernetesEmptyNodeDeletionTime: 3 * time.Minute,
+ MaxCloudProviderNodeDeletionTime: 5 * time.Minute,
}
context, err := NewScaleTestAutoscalingContext(options, nil, nil, nil, nil, nil)
assert.NoError(t, err)
120 changes: 59 additions & 61 deletions cluster-autoscaler/main.go
@@ -160,15 +160,14 @@ var (
maxEmptyBulkDeleteFlag = flag.Int("max-empty-bulk-delete", 10, "Maximum number of empty nodes that can be deleted at the same time.")
maxGracefulTerminationFlag = flag.Int("max-graceful-termination-sec", 10*60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node. "+
"This flag is mutually exclusion with drain-priority-config flag which allows more configuration options.")
- maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
- okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
- scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
- parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
- maxCloudProviderNodeDeletionTime = flag.Duration("max-cloud-provider-node-deletion-time", 5*time.Minute, "Maximum time needed by cloud provider to delete a node")
- maxKubernetesEmptyNodeDeletionTime = flag.Duration("max-kubernetes-empty-node-deletion-time", 3*time.Minute, "Maximum time needed by cloud provider to delete a node")
- maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
- maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
- nodeGroupsFlag = multiStringFlag(
+ maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 45, "Maximum percentage of unready nodes in the cluster. After this is exceeded, CA halts operations")
+ okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
+ scaleUpFromZero = flag.Bool("scale-up-from-zero", true, "Should CA scale up when there are 0 ready nodes.")
+ parallelScaleUp = flag.Bool("parallel-scale-up", false, "Whether to allow parallel node groups scale up. Experimental: may not work on some cloud providers, enable at your own risk.")
+ maxCloudProviderNodeDeletionTime = flag.Duration("max-cloud-provider-node-deletion-time", 5*time.Minute, "Maximum time needed by cloud provider to delete a node")
+ maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "The default maximum time CA waits for node to be provisioned - the value can be overridden per node group")
+ maxPodEvictionTime = flag.Duration("max-pod-eviction-time", 2*time.Minute, "Maximum time CA tries to evict a pod before giving up")
+ nodeGroupsFlag = multiStringFlag(
"nodes",
"sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...>")
nodeGroupAutoDiscoveryFlag = multiStringFlag(
@@ -342,58 +341,57 @@ func createAutoscalingOptions() config.AutoscalingOptions {
IgnoreDaemonSetsUtilization: *ignoreDaemonSetsUtilization,
MaxNodeProvisionTime: *maxNodeProvisionTime,
},
- CloudConfig: *cloudConfig,
- CloudProviderName: *cloudProviderFlag,
- NodeGroupAutoDiscovery: *nodeGroupAutoDiscoveryFlag,
- MaxTotalUnreadyPercentage: *maxTotalUnreadyPercentage,
- OkTotalUnreadyCount: *okTotalUnreadyCount,
- ScaleUpFromZero: *scaleUpFromZero,
- ParallelScaleUp: *parallelScaleUp,
- EstimatorName: *estimatorFlag,
- ExpanderNames: *expanderFlag,
- GRPCExpanderCert: *grpcExpanderCert,
- GRPCExpanderURL: *grpcExpanderURL,
- IgnoreMirrorPodsUtilization: *ignoreMirrorPodsUtilization,
- MaxBulkSoftTaintCount: *maxBulkSoftTaintCount,
- MaxBulkSoftTaintTime: *maxBulkSoftTaintTime,
- MaxEmptyBulkDelete: *maxEmptyBulkDeleteFlag,
- MaxCloudProviderNodeDeletionTime: *maxCloudProviderNodeDeletionTime,
- MaxKubernetesEmptyNodeDeletionTime: *maxKubernetesEmptyNodeDeletionTime,
- MaxGracefulTerminationSec: *maxGracefulTerminationFlag,
- MaxPodEvictionTime: *maxPodEvictionTime,
- MaxNodesTotal: *maxNodesTotal,
- MaxCoresTotal: maxCoresTotal,
- MinCoresTotal: minCoresTotal,
- MaxMemoryTotal: maxMemoryTotal,
- MinMemoryTotal: minMemoryTotal,
- GpuTotal: parsedGpuTotal,
- NodeGroups: *nodeGroupsFlag,
- EnforceNodeGroupMinSize: *enforceNodeGroupMinSize,
- ScaleDownDelayAfterAdd: *scaleDownDelayAfterAdd,
- ScaleDownDelayAfterDelete: *scaleDownDelayAfterDelete,
- ScaleDownDelayAfterFailure: *scaleDownDelayAfterFailure,
- ScaleDownEnabled: *scaleDownEnabled,
- ScaleDownUnreadyEnabled: *scaleDownUnreadyEnabled,
- ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
- ScaleDownCandidatesPoolRatio: *scaleDownCandidatesPoolRatio,
- ScaleDownCandidatesPoolMinCount: *scaleDownCandidatesPoolMinCount,
- DrainPriorityConfig: drainPriorityConfigMap,
- SchedulerConfig: parsedSchedConfig,
- WriteStatusConfigMap: *writeStatusConfigMapFlag,
- StatusConfigMapName: *statusConfigMapName,
- BalanceSimilarNodeGroups: *balanceSimilarNodeGroupsFlag,
- ConfigNamespace: *namespace,
- ClusterName: *clusterName,
- NodeAutoprovisioningEnabled: *nodeAutoprovisioningEnabled,
- MaxAutoprovisionedNodeGroupCount: *maxAutoprovisionedNodeGroupCount,
- UnremovableNodeRecheckTimeout: *unremovableNodeRecheckTimeout,
- ExpendablePodsPriorityCutoff: *expendablePodsPriorityCutoff,
- Regional: *regional,
- NewPodScaleUpDelay: *newPodScaleUpDelay,
- StartupTaints: append(*ignoreTaintsFlag, *startupTaintsFlag...),
- StatusTaints: *statusTaintsFlag,
- BalancingExtraIgnoredLabels: *balancingIgnoreLabelsFlag,
- BalancingLabels: *balancingLabelsFlag,
+ CloudConfig: *cloudConfig,
+ CloudProviderName: *cloudProviderFlag,
+ NodeGroupAutoDiscovery: *nodeGroupAutoDiscoveryFlag,
+ MaxTotalUnreadyPercentage: *maxTotalUnreadyPercentage,
+ OkTotalUnreadyCount: *okTotalUnreadyCount,
+ ScaleUpFromZero: *scaleUpFromZero,
+ ParallelScaleUp: *parallelScaleUp,
+ EstimatorName: *estimatorFlag,
+ ExpanderNames: *expanderFlag,
+ GRPCExpanderCert: *grpcExpanderCert,
+ GRPCExpanderURL: *grpcExpanderURL,
+ IgnoreMirrorPodsUtilization: *ignoreMirrorPodsUtilization,
+ MaxBulkSoftTaintCount: *maxBulkSoftTaintCount,
+ MaxBulkSoftTaintTime: *maxBulkSoftTaintTime,
+ MaxEmptyBulkDelete: *maxEmptyBulkDeleteFlag,
+ MaxCloudProviderNodeDeletionTime: *maxCloudProviderNodeDeletionTime,
+ MaxGracefulTerminationSec: *maxGracefulTerminationFlag,
+ MaxPodEvictionTime: *maxPodEvictionTime,
+ MaxNodesTotal: *maxNodesTotal,
+ MaxCoresTotal: maxCoresTotal,
+ MinCoresTotal: minCoresTotal,
+ MaxMemoryTotal: maxMemoryTotal,
+ MinMemoryTotal: minMemoryTotal,
+ GpuTotal: parsedGpuTotal,
+ NodeGroups: *nodeGroupsFlag,
+ EnforceNodeGroupMinSize: *enforceNodeGroupMinSize,
+ ScaleDownDelayAfterAdd: *scaleDownDelayAfterAdd,
+ ScaleDownDelayAfterDelete: *scaleDownDelayAfterDelete,
+ ScaleDownDelayAfterFailure: *scaleDownDelayAfterFailure,
+ ScaleDownEnabled: *scaleDownEnabled,
+ ScaleDownUnreadyEnabled: *scaleDownUnreadyEnabled,
+ ScaleDownNonEmptyCandidatesCount: *scaleDownNonEmptyCandidatesCount,
+ ScaleDownCandidatesPoolRatio: *scaleDownCandidatesPoolRatio,
+ ScaleDownCandidatesPoolMinCount: *scaleDownCandidatesPoolMinCount,
+ DrainPriorityConfig: drainPriorityConfigMap,
+ SchedulerConfig: parsedSchedConfig,
+ WriteStatusConfigMap: *writeStatusConfigMapFlag,
+ StatusConfigMapName: *statusConfigMapName,
+ BalanceSimilarNodeGroups: *balanceSimilarNodeGroupsFlag,
+ ConfigNamespace: *namespace,
+ ClusterName: *clusterName,
+ NodeAutoprovisioningEnabled: *nodeAutoprovisioningEnabled,
+ MaxAutoprovisionedNodeGroupCount: *maxAutoprovisionedNodeGroupCount,
+ UnremovableNodeRecheckTimeout: *unremovableNodeRecheckTimeout,
+ ExpendablePodsPriorityCutoff: *expendablePodsPriorityCutoff,
+ Regional: *regional,
+ NewPodScaleUpDelay: *newPodScaleUpDelay,
+ StartupTaints: append(*ignoreTaintsFlag, *startupTaintsFlag...),
+ StatusTaints: *statusTaintsFlag,
+ BalancingExtraIgnoredLabels: *balancingIgnoreLabelsFlag,
+ BalancingLabels: *balancingLabelsFlag,
KubeClientOpts: config.KubeClientOptions{
Master: *kubernetes,
KubeConfigPath: *kubeConfigFile,
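
One operational consequence of dropping the max-kubernetes-empty-node-deletion-time flag definition in main.go above is that any deployment still passing it will fail flag parsing at startup. A hedged illustration using Go's standard flag package; the flag-set name and argument list here are invented for the demo, while the real binary parses os.Args:

package main

import (
	"flag"
	"fmt"
	"time"
)

func main() {
	// Only a flag that remains after this commit is registered here.
	fs := flag.NewFlagSet("cluster-autoscaler", flag.ContinueOnError)
	fs.Duration("max-cloud-provider-node-deletion-time", 5*time.Minute, "Maximum time needed by cloud provider to delete a node")

	// Passing the removed flag now produces a parse error
	// ("flag provided but not defined").
	err := fs.Parse([]string{"--max-kubernetes-empty-node-deletion-time=3m"})
	fmt.Println(err)
}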
