Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cmd to configure nodegroups on a running cluster #2246

Merged
merged 41 commits into from
Jun 17, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
3b28bbd
WIP on nodegroup adder
RobertLucian Jun 7, 2021
6fa2e8f
WIP nodegroup adder cmd
RobertLucian Jun 8, 2021
d4d2bda
Merge branch 'master' into feature/add-or-remove-ngs
RobertLucian Jun 8, 2021
ec88c2d
Use the simplified aws resource table when showing the costs
RobertLucian Jun 8, 2021
3083606
WIP cluster configure
RobertLucian Jun 9, 2021
ddd5b3e
WIP cluster configure
RobertLucian Jun 10, 2021
9de468a
Bug fixes
RobertLucian Jun 10, 2021
bc3f33a
Further fixes on the cloudformation stacks
RobertLucian Jun 10, 2021
914233b
Address layout on install.sh
RobertLucian Jun 10, 2021
3fcc248
Add priority field to the node group config
RobertLucian Jun 10, 2021
e8b7bf6
Document the priority field in the docs
RobertLucian Jun 10, 2021
19e4be5
Make lint
RobertLucian Jun 10, 2021
cadadfa
Layout change for cluster configure cmd
RobertLucian Jun 11, 2021
20412af
Better reconciliation w/ cloudformation stacks
RobertLucian Jun 11, 2021
a38ec20
Fix number of SGs when cluster already exists
RobertLucian Jun 11, 2021
299fb39
Quota fixes
RobertLucian Jun 11, 2021
e2949c2
Further fixes
RobertLucian Jun 11, 2021
2b445b2
Improve cluster info cmd
RobertLucian Jun 11, 2021
1b599a2
Remove debugging comments
RobertLucian Jun 11, 2021
33c6d36
Nits
RobertLucian Jun 11, 2021
d7d394b
Remove the nodegroups first and then add the others
RobertLucian Jun 11, 2021
df02113
Merge branch 'master' into feature/add-or-remove-ngs
RobertLucian Jun 11, 2021
2c50eb4
Nits
RobertLucian Jun 11, 2021
d6fd7b8
Separate validate functions
RobertLucian Jun 14, 2021
5f385cc
Simplify get cluster state package
RobertLucian Jun 14, 2021
97e4f30
Address PR comments
RobertLucian Jun 14, 2021
4153321
Merge branch 'master' into feature/add-or-remove-ngs
RobertLucian Jun 14, 2021
1c42ae0
Add missing error print when stacks couldn't be retrieved
RobertLucian Jun 14, 2021
157fec7
Bolts and fixes
RobertLucian Jun 14, 2021
c562386
Address PR comments
RobertLucian Jun 15, 2021
fb53dda
Print cluster stacks when running cluster info cmd
RobertLucian Jun 15, 2021
eae2bd1
Refactor
RobertLucian Jun 15, 2021
262b4f5
Some refactoring
RobertLucian Jun 15, 2021
b0fc893
Merge branch 'master' into feature/add-or-remove-ngs
RobertLucian Jun 15, 2021
0068a7d
Fix to the number of required SGs on configure
RobertLucian Jun 15, 2021
3bc9d13
Merge branch 'master' into feature/add-or-remove-ngs
RobertLucian Jun 16, 2021
91bf390
Address PR comments
RobertLucian Jun 16, 2021
d4b5bb9
Address merge conflicts from master
RobertLucian Jun 16, 2021
b58b7a6
Merge branch 'master' into feature/add-or-remove-ngs
RobertLucian Jun 17, 2021
4d5e378
Addressing PR comments and a fix
RobertLucian Jun 17, 2021
4d40100
Merge branch 'master' into feature/add-or-remove-ngs
RobertLucian Jun 17, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactor
  • Loading branch information
RobertLucian committed Jun 15, 2021
commit eae2bd1c49b4f6cfb63045b0d24d0cb21b801a28
2 changes: 1 addition & 1 deletion cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ var _clusterConfigureCmd = &cobra.Command{
out, exitCode, err := runManagerWithClusterConfig("/root/install.sh --configure", newClusterConfig, awsClient, nil, nil, []string{
"CORTEX_NODEGROUP_NAMES_TO_SCALE=" + strings.Join(configureChanges.NodeGroupsToScale, " "), // NodeGroupsToScale contain the cluster config node-group names
"CORTEX_NODEGROUP_NAMES_TO_ADD=" + strings.Join(configureChanges.NodeGroupsToAdd, " "), // NodeGroupsToAdd contain the cluster config node-group names
"CORTEX_NODEGROUP_NAMES_TO_REMOVE=" + strings.Join(configureChanges.NodeGroupsToRemove, " "), // NodeGroupsToRemove contain the EKS node-group names
"CORTEX_NODEGROUP_NAMES_TO_REMOVE=" + strings.Join(configureChanges.StaleEKSNodeGroups, " "), // StaleEKSNodeGroups contain the EKS node-group names
RobertLucian marked this conversation as resolved.
Show resolved Hide resolved
})
if err != nil {
exit.Error(err)
Expand Down
13 changes: 1 addition & 12 deletions cli/cmd/lib_cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import (
libmath "github.com/cortexlabs/cortex/pkg/lib/math"
"github.com/cortexlabs/cortex/pkg/lib/pointer"
"github.com/cortexlabs/cortex/pkg/lib/prompt"
"github.com/cortexlabs/cortex/pkg/lib/slices"
s "github.com/cortexlabs/cortex/pkg/lib/strings"
"github.com/cortexlabs/cortex/pkg/lib/table"
"github.com/cortexlabs/cortex/pkg/types/clusterconfig"
Expand Down Expand Up @@ -158,22 +157,12 @@ func getConfigureClusterConfig(awsClient *aws.Client, stacks clusterstate.Cluste
newUserClusterConfig.Telemetry = isTelemetryEnabled()
cachedClusterConfig.Telemetry = newUserClusterConfig.Telemetry

configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, cachedClusterConfig)
configureChanges, err := newUserClusterConfig.ValidateOnConfigure(awsClient, cachedClusterConfig, stacks.NodeGroupsStacks)
if err != nil {
err = errors.Append(err, fmt.Sprintf("\n\ncluster configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor))
return nil, clusterconfig.ConfigureChanges{}, errors.Wrap(err, newClusterConfigFile)
}

// intersect with the stale eks node groups
eksNodeGroupsToRemove := []string{}
staleEKSNgs, staleEKSNgAvailabilities := stacks.GetStaleNodeGroupNames(*newUserClusterConfig)
for i := range staleEKSNgs {
if slices.HasString(configureChanges.NodeGroupsToRemove, staleEKSNgs[i]) {
eksNodeGroupsToRemove = append(eksNodeGroupsToRemove, clusterstate.GetStackName(newUserClusterConfig.ClusterName, staleEKSNgAvailabilities[i], staleEKSNgs[i]))
}
}
configureChanges.NodeGroupsToRemove = eksNodeGroupsToRemove

confirmConfigureClusterConfig(configureChanges, cachedClusterConfig, *newUserClusterConfig, _flagClusterDisallowPrompt)

return newUserClusterConfig, configureChanges, nil
Expand Down
31 changes: 29 additions & 2 deletions pkg/types/clusterconfig/cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"time"

"github.com/aws/amazon-vpc-cni-k8s/pkg/awsutils"
"github.com/aws/aws-sdk-go/service/cloudformation"
"github.com/aws/aws-sdk-go/service/iam"
"github.com/cortexlabs/cortex/pkg/consts"
"github.com/cortexlabs/cortex/pkg/lib/aws"
Expand Down Expand Up @@ -190,10 +191,11 @@ type ConfigureChanges struct {
NodeGroupsToAdd []string
NodeGroupsToRemove []string
NodeGroupsToScale []string
StaleEKSNodeGroups []string
RobertLucian marked this conversation as resolved.
Show resolved Hide resolved
}

func (c *ConfigureChanges) HasChanges() bool {
return len(c.NodeGroupsToAdd) != 0 || len(c.NodeGroupsToRemove) != 0 || len(c.NodeGroupsToScale) != 0
return len(c.NodeGroupsToAdd) != 0 || len(c.NodeGroupsToRemove) != 0 || len(c.NodeGroupsToScale) != 0 || len(c.StaleEKSNodeGroups) != 0
}

// NewForFile initializes and validates the cluster config from the YAML config file
Expand Down Expand Up @@ -1101,7 +1103,7 @@ func (cc *Config) ValidateOnInstall(awsClient *aws.Client) error {
return nil
}

func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, oldConfig Config) (ConfigureChanges, error) {
func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, oldConfig Config, eksNodeGroupStacks []*cloudformation.StackSummary) (ConfigureChanges, error) {
fmt.Print("verifying your configuration ...\n\n")

cc.ClusterUID = oldConfig.ClusterUID
Expand Down Expand Up @@ -1144,6 +1146,7 @@ func (cc *Config) ValidateOnConfigure(awsClient *aws.Client, oldConfig Config) (
NodeGroupsToAdd: GetNodeGroupNames(ngsToBeAdded),
NodeGroupsToRemove: GetNodeGroupNames(ngsToBeRemoved),
NodeGroupsToScale: GetNodeGroupNames(ngNamesToBeScaled),
StaleEKSNodeGroups: getStaleEksNodeGroups(cc.ClusterName, eksNodeGroupStacks, ngsToBeRemoved),
}, nil
}

Expand Down Expand Up @@ -1523,6 +1526,30 @@ func (ng *NodeGroup) SpotConfigOnDemandValues() (int64, int64) {
return onDemandBaseCapacity, onDemandPercentageAboveBaseCapacity
}

func getStaleEksNodeGroups(clusterName string, eksNodeGroupStacks []*cloudformation.StackSummary, ngsMarkedForRemoval []*NodeGroup) []string {
eksNodeGroupsToRemove := []string{}
for _, ng := range ngsMarkedForRemoval {
availability := "d"
RobertLucian marked this conversation as resolved.
Show resolved Hide resolved
if ng.Spot {
availability = "s"
}

eksNgName := fmt.Sprintf("cx-w%s-%s", availability, ng.Name)
eksStackName := fmt.Sprintf("eksctl-%s-nodegroup-cx-w%s-%s", clusterName, availability, ng.Name)
for _, eksNgStack := range eksNodeGroupStacks {
if eksNgStack == nil || eksNgStack.StackName == nil {
continue
}
if *eksNgStack.StackName == eksStackName {
eksNodeGroupsToRemove = append(eksNodeGroupsToRemove, eksNgName)
break
}
}
}

return eksNodeGroupsToRemove
}

func (cc *CoreConfig) TelemetryEvent() map[string]interface{} {
event := make(map[string]interface{})

Expand Down