Add option to prefer scheduling on stable clusters #1052

Merged: 1 commit on May 30, 2024
1 change: 1 addition & 0 deletions cmd/cloud/server.go
@@ -164,6 +164,7 @@ func executeServerCmd(flags serverFlags) error {
// TODO: move these cluster threshold values to cluster configuration.
installationScheduling := supervisor.NewInstallationSupervisorSchedulingOptions(
flags.balancedInstallationScheduling,
flags.preferScheduleOnStableClusters,
flags.clusterResourceThreshold,
flags.thresholdCPUOverride,
flags.thresholdMemoryOverride,
2 changes: 2 additions & 0 deletions cmd/cloud/server_flag.go
@@ -51,6 +51,7 @@ func (flags *supervisorOptions) addFlags(command *cobra.Command) {

type schedulingOptions struct {
balancedInstallationScheduling bool
preferScheduleOnStableClusters bool
clusterResourceThresholdScaleValue int
clusterResourceThreshold int
thresholdCPUOverride int
@@ -60,6 +61,7 @@ type schedulingOptions struct {

func (flags *schedulingOptions) addFlags(command *cobra.Command) {
command.Flags().BoolVar(&flags.balancedInstallationScheduling, "balanced-installation-scheduling", true, "Whether to schedule installations on the cluster with the greatest percentage of available resources or not. (slows down scheduling speed as cluster count increases)")
command.Flags().BoolVar(&flags.preferScheduleOnStableClusters, "prefer-stable-cluster-installation-scheduling", false, "Whether to prioritize scheduling installations on the clusters in the stable state or not. (can slow scheduling speed as cluster count increases)")
command.Flags().IntVar(&flags.clusterResourceThresholdScaleValue, "cluster-resource-threshold-scale-value", 0, "The number of worker nodes to scale up by when the threshold is passed. Set to 0 for no scaling. Scaling will never exceed the cluster max worker configuration value.")
command.Flags().IntVar(&flags.clusterResourceThreshold, "cluster-resource-threshold", 80, "The percent threshold where new installations won't be scheduled on a multi-tenant cluster.")
command.Flags().IntVar(&flags.thresholdCPUOverride, "cluster-resource-threshold-cpu-override", 0, "The cluster-resource-threshold override value for CPU resources only")
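For reference, a minimal sketch (not part of the diff) of how the parsed flag values feed the scheduling options. The argument order follows the updated NewInstallationSupervisorSchedulingOptions signature below; the threshold values are illustrative, and since supervisor is an internal package this only compiles inside the mattermost-cloud module:

package main

import (
	"log"

	"github.com/mattermost/mattermost-cloud/internal/supervisor"
)

func main() {
	// Sketch only: threshold values are illustrative. Argument order
	// follows the updated NewInstallationSupervisorSchedulingOptions
	// signature introduced in this PR.
	opts := supervisor.NewInstallationSupervisorSchedulingOptions(
		true, // balanced-installation-scheduling
		true, // prefer-stable-cluster-installation-scheduling (new flag)
		80,   // cluster-resource-threshold (percent)
		0,    // CPU override (0 = fall back to the shared threshold)
		0,    // memory override
		0,    // pod count override
		0,    // scale value (0 = no automatic worker scaling)
	)
	if err := opts.Validate(); err != nil {
		log.Fatalf("invalid scheduling options: %v", err)
	}
}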
38 changes: 34 additions & 4 deletions internal/supervisor/installation.go
@@ -121,6 +121,7 @@ type InstallationSupervisorCache struct {
// how installation scheduling occurs.
type InstallationSupervisorSchedulingOptions struct {
BalanceInstallations bool
PreferScheduleOnStableClusters bool
ClusterResourceThresholdCPU int
ClusterResourceThresholdMemory int
ClusterResourceThresholdPodCount int
@@ -168,9 +169,10 @@ func NewInstallationSupervisor(
}

// NewInstallationSupervisorSchedulingOptions creates a new InstallationSupervisorSchedulingOptions.
-func NewInstallationSupervisorSchedulingOptions(balanceInstallations bool, clusterResourceThreshold, thresholdCPUOverride, thresholdMemoryOverride, thresholdPodCountOverride, clusterResourceThresholdScaleValue int) InstallationSupervisorSchedulingOptions {
+func NewInstallationSupervisorSchedulingOptions(balanceInstallations, preferStableClusters bool, clusterResourceThreshold, thresholdCPUOverride, thresholdMemoryOverride, thresholdPodCountOverride, clusterResourceThresholdScaleValue int) InstallationSupervisorSchedulingOptions {
schedulingOptions := InstallationSupervisorSchedulingOptions{
BalanceInstallations: balanceInstallations,
PreferScheduleOnStableClusters: preferStableClusters,
ClusterResourceThresholdCPU: clusterResourceThreshold,
ClusterResourceThresholdMemory: clusterResourceThreshold,
ClusterResourceThresholdPodCount: clusterResourceThreshold,
@@ -438,7 +440,12 @@ func (s *InstallationSupervisor) createInstallation(installation *model.Installa

if s.scheduling.BalanceInstallations {
logger.Info("Attempting to schedule installation on the lowest-utilized cluster")
-clusters = s.prioritizeLowerUtilizedClusters(clusters, installation, instanceID, logger)
+clusters = s.prioritizeLowerUtilizedClusters(clusters, installation, logger)
}

if s.scheduling.PreferScheduleOnStableClusters {
logger.Info("Attempting to schedule installation on a cluster in the stable state")
clusters = PrioritizeStableStateClusters(clusters)
}

for _, cluster := range clusters {
@@ -466,7 +473,7 @@
// list will be the lowest at the time it was checked.
// - When scheduling an installation, all of the standard scheduling checks
// should be performed again under cluster lock.
-func (s *InstallationSupervisor) prioritizeLowerUtilizedClusters(clusters []*model.Cluster, installation *model.Installation, instanceID string, logger log.FieldLogger) []*model.Cluster {
+func (s *InstallationSupervisor) prioritizeLowerUtilizedClusters(clusters []*model.Cluster, installation *model.Installation, logger log.FieldLogger) []*model.Cluster {
lowestResourcePercent := 10000
var filteredPrioritizedClusters []*model.Cluster

@@ -517,6 +524,25 @@ func (s *InstallationSupervisor) prioritizeLowerUtilizedClusters(clusters []*mod
return filteredPrioritizedClusters
}

// PrioritizeStableStateClusters will sort the cluster list prioritizing
// clusters in the stable state.
func PrioritizeStableStateClusters(clusters []*model.Cluster) []*model.Cluster {
// Build a new prioritized list of clusters. The cluster list may already be
// sorted by resource usage so try to preserve that as well.
var stableClusters []*model.Cluster
var unstableClusters []*model.Cluster

for _, cluster := range clusters {
if cluster.State == model.ClusterStateStable {
stableClusters = append(stableClusters, cluster)
} else {
unstableClusters = append(unstableClusters, cluster)
}
}

return append(stableClusters, unstableClusters...)
}

// getClusterResources returns cluster resources from cache or will obtain them
// directly if they don't exist.
func (s *InstallationSupervisor) getClusterResources(cluster *model.Cluster, logger log.FieldLogger) (*k8s.ClusterResources, error) {
@@ -660,7 +686,11 @@ func (s *InstallationSupervisor) createClusterInstallation(cluster *model.Cluste
logger.WithError(err).Error("Failed to create cluster installation state change event")
}

logger.Infof("Requested creation of cluster installation on cluster %s. Expected resource load: CPU=%d%%, Memory=%d%%, PodCount=%d%%", cluster.ID, cpuPercent, memoryPercent, podPercent)
logger.Infof(
"Requested creation of cluster installation on cluster %s (state=%s). Expected resource load: CPU=%d%%, Memory=%d%%, PodCount=%d%%",
cluster.ID, cluster.State,
cpuPercent, memoryPercent, podPercent,
)

return clusterInstallation
}
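Note that PrioritizeStableStateClusters is an order-preserving partition, so when it runs after prioritizeLowerUtilizedClusters in createInstallation, cluster state becomes the primary ordering while the earlier utilization ordering survives within each group. A minimal sketch of that behavior, assuming the standard mattermost-cloud import paths (again internal, so it only compiles inside the repo); the cluster IDs are hypothetical:

package main

import (
	"fmt"

	"github.com/mattermost/mattermost-cloud/internal/supervisor"
	"github.com/mattermost/mattermost-cloud/model"
)

func main() {
	// Assume the slice is already sorted lowest-utilization first, as
	// prioritizeLowerUtilizedClusters would leave it.
	clusters := []*model.Cluster{
		{ID: "low-usage-resizing", State: model.ClusterStateResizeRequested},
		{ID: "mid-usage-stable", State: model.ClusterStateStable},
		{ID: "high-usage-stable", State: model.ClusterStateStable},
	}
	for _, c := range supervisor.PrioritizeStableStateClusters(clusters) {
		fmt.Println(c.ID, c.State)
	}
	// Prints the two stable clusters first, still lowest-usage first,
	// then the resizing cluster.
}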
106 changes: 93 additions & 13 deletions internal/supervisor/installation_test.go
@@ -651,7 +651,7 @@ func (m *mockCloudflareClient) DeleteDNSRecords(customerDNSName []string, logger
}

func TestInstallationSupervisorDo(t *testing.T) {
-standardSchedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, 80, 0, 0, 0, 0)
+standardSchedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, 80, 0, 0, 0, 0)
require.NoError(t, standardSchedulingOptions.Validate())

t.Run("no installations pending work", func(t *testing.T) {
@@ -729,7 +729,7 @@ func TestInstallationSupervisorDo(t *testing.T) {
}

func TestInstallationSupervisor(t *testing.T) {
-standardSchedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, 80, 0, 0, 0, 0)
+standardSchedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, 80, 0, 0, 0, 0)
require.NoError(t, standardSchedulingOptions.Validate())

expectInstallationState := func(t *testing.T, sqlStore *store.SQLStore, installation *model.Installation, expectedState string) {
@@ -2491,7 +2491,7 @@ func TestInstallationSupervisor(t *testing.T) {
UsedPodCount: 100,
},
}
-schedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, 80, 0, 0, 0, 2)
+schedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(false, false, 80, 0, 0, 0, 2)
require.NoError(t, schedulingOptions.Validate())
supervisor := supervisor.NewInstallationSupervisor(
sqlStore,
@@ -2538,7 +2538,7 @@
sqlStore := store.MakeTestSQLStore(t, logger)
defer store.CloseConnection(t, sqlStore)

-schedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 0, 0, 0, 0)
+schedulingOptions := supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 0, 0)
require.NoError(t, schedulingOptions.Validate())
supervisor := supervisor.NewInstallationSupervisor(
sqlStore,
@@ -2722,7 +2722,7 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
}{
{
name: "valid, no overrides",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 0, 0, 0, 2),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 80,
@@ -2734,7 +2734,7 @@
},
{
name: "valid, cpu override",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 40, 0, 0, 2),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 40, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 40,
@@ -2746,7 +2746,7 @@
},
{
name: "valid, memory override",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 0, 40, 0, 2),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 40, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 80,
@@ -2758,7 +2758,7 @@
},
{
name: "valid, pod count override",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 0, 0, 40, 2),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 40, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 80,
@@ -2770,7 +2770,7 @@
},
{
name: "invalid, no overrides",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, -1, 0, 0, 0, 2),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, -1, 0, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: -1,
@@ -2782,7 +2782,7 @@
},
{
name: "invalid, cpu override",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 2, 0, 0, 2),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 2, 0, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 2,
@@ -2794,7 +2794,7 @@
},
{
name: "invalid, memory override",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 0, 2, 0, 2),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 2, 0, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 80,
@@ -2806,7 +2806,7 @@
},
{
name: "invalid, pod count override",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 0, 0, 2, 2),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 2, 2),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 80,
@@ -2818,7 +2818,7 @@
},
{
name: "invalid, scale value out of bounds",
-inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, 80, 0, 0, 0, -1),
+inputOptions: supervisor.NewInstallationSupervisorSchedulingOptions(true, false, 80, 0, 0, 0, -1),
expectedOptions: supervisor.InstallationSupervisorSchedulingOptions{
BalanceInstallations: true,
ClusterResourceThresholdCPU: 80,
@@ -2840,3 +2840,83 @@ func TestInstallationSupervisorSchedulingOptions(t *testing.T) {
})
}
}

func TestPrioritizeStableStateClusters(t *testing.T) {
for _, testCase := range []struct {
name string
inputClusters []*model.Cluster
expectedClusters []*model.Cluster
}{
{
name: "one stable cluster",
inputClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateStable},
},
expectedClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateStable},
},
},
{
name: "one provisioning cluster",
inputClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateProvisionInProgress},
},
expectedClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateProvisionInProgress},
},
},
{
name: "two stable clusters",
inputClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateStable},
{ID: "id2", State: model.ClusterStateStable},
},
expectedClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateStable},
{ID: "id2", State: model.ClusterStateStable},
},
},
{
name: "one stable, one provisioning",
inputClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateStable},
{ID: "id2", State: model.ClusterStateProvisionInProgress},
},
expectedClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateStable},
{ID: "id2", State: model.ClusterStateProvisionInProgress},
},
},
{
name: "one provisioning, one stable",
inputClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateProvisionInProgress},
{ID: "id2", State: model.ClusterStateStable},
},
expectedClusters: []*model.Cluster{
{ID: "id2", State: model.ClusterStateStable},
{ID: "id1", State: model.ClusterStateProvisionInProgress},
},
},
{
name: "complex",
inputClusters: []*model.Cluster{
{ID: "id1", State: model.ClusterStateProvisionInProgress},
{ID: "id2", State: model.ClusterStateStable},
{ID: "id3", State: model.ClusterStateStable},
{ID: "id4", State: model.ClusterStateResizeRequested},
{ID: "id5", State: model.ClusterStateStable},
},
expectedClusters: []*model.Cluster{
{ID: "id2", State: model.ClusterStateStable},
{ID: "id3", State: model.ClusterStateStable},
{ID: "id5", State: model.ClusterStateStable},
{ID: "id1", State: model.ClusterStateProvisionInProgress},
{ID: "id4", State: model.ClusterStateResizeRequested},
},
},
} {
t.Run(testCase.name, func(t *testing.T) {
assert.Equal(t, testCase.expectedClusters, supervisor.PrioritizeStableStateClusters(testCase.inputClusters))
})
}
}