Skip to content

Use accessconfig to determine clusterstate #892

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions pkg/lib/aws/cloudformation.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,9 @@ import (
"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
)

func (c *Client) ListEKSStacks(controlPlaneStack string, nodegroupStacks ...string) ([]*cloudformation.StackSummary, error) {
func (c *Client) ListEKSStacks(controlPlaneStackName string, nodegroupStackNames strset.Set) ([]*cloudformation.StackSummary, error) {
var stackSummaries []*cloudformation.StackSummary
stackSet := strset.New(nodegroupStacks...)
stackSet.Add(controlPlaneStack)
stackSet := strset.Union(nodegroupStackNames, strset.New(controlPlaneStackName))
err := c.CloudFormation().ListStacksPages(
&cloudformation.ListStacksInput{},
func(listStackOutput *cloudformation.ListStacksOutput, lastPage bool) bool {
Expand All @@ -34,7 +33,7 @@ func (c *Client) ListEKSStacks(controlPlaneStack string, nodegroupStacks ...stri
stackSummaries = append(stackSummaries, stackSummary)
}

if *stackSummary.StackName == controlPlaneStack {
if *stackSummary.StackName == controlPlaneStackName {
return false
}
}
Expand Down
99 changes: 60 additions & 39 deletions pkg/types/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package clusterstate

import (
"fmt"
"time"

"github.com/aws/aws-sdk-go/service/cloudformation"
"github.com/cortexlabs/cortex/pkg/lib/aws"
Expand All @@ -42,8 +43,16 @@ type ClusterState struct {
Status Status
}

func any(statuses []string, allowedStatuses ...string) bool {
func is(status string, allowedStatus string, allowedStatuses ...string) bool {
statusSet := strset.New(allowedStatuses...)
statusSet.Add(allowedStatus)

return statusSet.Has(status)
}

func any(statuses []string, allowedStatus string, allowedStatuses ...string) bool {
statusSet := strset.New(allowedStatuses...)
statusSet.Add(allowedStatus)
for _, stackStatus := range statuses {
if statusSet.Has(stackStatus) {
return true
Expand All @@ -53,8 +62,9 @@ func any(statuses []string, allowedStatuses ...string) bool {
return false
}

func all(statuses []string, allowedStatuses ...string) bool {
func all(statuses []string, allowedStatus string, allowedStatuses ...string) bool {
statusSet := strset.New(allowedStatuses...)
statusSet.Add(allowedStatus)
for _, stackStatus := range statuses {
if !statusSet.Has(stackStatus) {
return false
Expand All @@ -76,9 +86,8 @@ func (cs ClusterState) TableString() string {

func getStatus(statusMap map[string]string, controlPlane string) (Status, error) {
// the order matters

allStatuses := []string{}
controlPlaneStatus := []string{statusMap[controlPlane]}
controlPlaneStatus := statusMap[controlPlane]
nodeGroupStatuses := []string{}

for stackName, status := range statusMap {
Expand All @@ -88,6 +97,19 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
}
}

if any(allStatuses, string(StatusCreateFailedTimedOut)) {
return StatusNotFound, ErrorUnexpectedCloudFormationStatus(s.ObjFlat(statusMap))
}

if len(nodeGroupStatuses) == 0 && controlPlaneStatus == string(StatusNotFound) {
return StatusNotFound, nil
}

// controlplane stack may be created while nodegroup stacks aren't listed in cloudformation stacks during cluster spin up
if len(nodeGroupStatuses) == 0 && is(controlPlaneStatus, cloudformation.StackStatusCreateComplete, cloudformation.StackStatusCreateInProgress) {
return StatusCreateInProgress, nil
}

if any(allStatuses, cloudformation.StackStatusCreateFailed) {
return StatusCreateFailed, nil
}
Expand All @@ -96,8 +118,8 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
return StatusDeleteFailed, nil
}

if all(allStatuses, string(StatusNotFound)) {
return StatusCreateComplete, nil
if any(allStatuses, cloudformation.StackStatusDeleteInProgress) {
return StatusDeleteInProgress, nil
}

if all(allStatuses, cloudformation.StackStatusCreateComplete) {
Expand All @@ -108,45 +130,54 @@ func getStatus(statusMap map[string]string, controlPlane string) (Status, error)
return StatusDeleteComplete, nil
}

if any(allStatuses, cloudformation.StackStatusDeleteInProgress) {
// nodegroup stacks are deleted first while control plane stack is still in create complete state
if controlPlaneStatus == cloudformation.StackStatusCreateComplete &&
all(nodeGroupStatuses, cloudformation.StackStatusDeleteInProgress, cloudformation.StackStatusDeleteComplete) {
return StatusDeleteInProgress, nil
}

// controlplane stack may be in complete state while nodegroup stacks are still in status not found
if all(controlPlaneStatus, cloudformation.StackStatusCreateComplete, cloudformation.StackStatusCreateInProgress) &&
all(nodeGroupStatuses, cloudformation.StackStatusCreateInProgress, string(StatusNotFound), cloudformation.StackStatusCreateComplete) {
// controlplane stack may be in complete state while nodegroup stacks are still in creating or one nodegroup finishes before the other
if controlPlaneStatus == cloudformation.StackStatusCreateComplete &&
all(nodeGroupStatuses, cloudformation.StackStatusCreateInProgress, cloudformation.StackStatusCreateComplete) {
return StatusCreateInProgress, nil
}

return StatusNotFound, ErrorUnexpectedCloudFormationStatus(s.ObjFlat(statusMap))
}

func GetClusterState(awsClient *aws.Client, clusterConfig *clusterconfig.Config) (*ClusterState, error) {
controlPlaneStackName := fmt.Sprintf(controlPlaneTemplate, clusterConfig.ClusterName)
operatorStackName := fmt.Sprintf(operatorTemplate, clusterConfig.ClusterName)
spotStackName := fmt.Sprintf(spotTemplate, clusterConfig.ClusterName)
onDemandStackName := fmt.Sprintf(onDemandTemplate, clusterConfig.ClusterName)

nodeGroupStackNames := []string{operatorStackName}
if clusterConfig.Spot != nil && *clusterConfig.Spot {
nodeGroupStackNames = append(nodeGroupStackNames, spotStackName)
if clusterConfig.SpotConfig != nil && clusterConfig.SpotConfig.OnDemandBackup != nil && *clusterConfig.SpotConfig.OnDemandBackup {
nodeGroupStackNames = append(nodeGroupStackNames, onDemandStackName)
}
} else {
nodeGroupStackNames = append(nodeGroupStackNames, onDemandStackName)
}
func GetClusterState(awsClient *aws.Client, accessConfig *clusterconfig.AccessConfig) (*ClusterState, error) {
controlPlaneStackName := fmt.Sprintf(controlPlaneTemplate, *accessConfig.ClusterName)
operatorStackName := fmt.Sprintf(operatorTemplate, *accessConfig.ClusterName)
spotStackName := fmt.Sprintf(spotTemplate, *accessConfig.ClusterName)
onDemandStackName := fmt.Sprintf(onDemandTemplate, *accessConfig.ClusterName)

stackSummaries, err := awsClient.ListEKSStacks(controlPlaneStackName, nodeGroupStackNames...)
nodeGroupStackNamesSet := strset.New(operatorStackName, spotStackName, onDemandStackName)

stackSummaries, err := awsClient.ListEKSStacks(controlPlaneStackName, nodeGroupStackNamesSet)
if err != nil {
return nil, errors.Wrap(err, "unable to get cluster state from cloudformation")
}

statusMap := map[string]string{}
statusMap[controlPlaneStackName] = getStatusFromSummaries(stackSummaries, controlPlaneStackName)
nodeGroupStackNames := []string{}
var controlPlaneCreationTime time.Time

for _, stackSummary := range stackSummaries {
statusMap[*stackSummary.StackName] = *stackSummary.StackStatus
if *stackSummary.StackName == controlPlaneStackName {
controlPlaneCreationTime = *stackSummary.CreationTime
} else {
nodeGroupStackNames = append(nodeGroupStackNames, *stackSummary.StackName)
}
}

for _, nodeGroupName := range nodeGroupStackNames {
statusMap[nodeGroupName] = getStatusFromSummaries(stackSummaries, nodeGroupName)
if _, ok := statusMap[controlPlaneStackName]; !ok {
statusMap[controlPlaneStackName] = string(StatusNotFound)
}

// add a timeout for situations where the control plane is listed in the cloudformation stacks but not the nodegroup stacks
if !is(statusMap[controlPlaneStackName], string(StatusNotFound), cloudformation.StackStatusDeleteComplete) && len(nodeGroupStackNames) == 0 && time.Now().After(controlPlaneCreationTime.Add(30*time.Minute)) {
statusMap[operatorStackName] = string(StatusCreateFailedTimedOut)
}

status, err := getStatus(statusMap, controlPlaneStackName)
Expand All @@ -161,13 +192,3 @@ func GetClusterState(awsClient *aws.Client, clusterConfig *clusterconfig.Config)
Status: status,
}, nil
}

func getStatusFromSummaries(stackSummaries []*cloudformation.StackSummary, stackName string) string {
for _, stackSummary := range stackSummaries {
if *stackSummary.StackName == stackName {
return *stackSummary.StackStatus
}
}

return string(StatusNotFound)
}
15 changes: 8 additions & 7 deletions pkg/types/clusterstate/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ package clusterstate
type Status string

const (
StatusNotFound Status = "not_found"
StatusCreateInProgress Status = "create_in_progress"
StatusCreateFailed Status = "create_failed"
StatusCreateComplete Status = "create_complete"
StatusDeleteInProgress Status = "delete_in_progress"
StatusDeleteComplete Status = "delete_complete"
StatusDeleteFailed Status = "delete_failed"
StatusNotFound Status = "not_found"
StatusCreateInProgress Status = "create_in_progress"
StatusCreateFailed Status = "create_failed"
StatusCreateComplete Status = "create_complete"
StatusDeleteInProgress Status = "delete_in_progress"
StatusDeleteComplete Status = "delete_complete"
StatusDeleteFailed Status = "delete_failed"
StatusCreateFailedTimedOut Status = "create_failed_timed_out"
)