Skip to content

Check cluster status before running cluster commands #881

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 175 additions & 54 deletions cli/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/cortexlabs/cortex/pkg/lib/telemetry"
"github.com/cortexlabs/cortex/pkg/operator/schema"
"github.com/cortexlabs/cortex/pkg/types/clusterconfig"
"github.com/cortexlabs/cortex/pkg/types/clusterstate"
"github.com/spf13/cobra"
)

Expand Down Expand Up @@ -89,6 +90,26 @@ var _upCmd = &cobra.Command{
exit.Error(err)
}

accessConfig := clusterConfig.ToAccessConfig()

awsClient, err := newAWSClient(*accessConfig.Region, awsCreds)
if err != nil {
exit.Error(err)
}

clusterState, err := clusterstate.GetClusterState(awsClient, &accessConfig)
if err != nil {
if errors.GetKind(err) == clusterstate.ErrUnexpectedCloudFormationStatus {
fmt.Println(clusterState.TableString())
fmt.Println(fmt.Sprintf("cluster %s in %s is in an unexpected state, please run `cortex cluster down` to delete the cluster or delete the cloudformation stacks manually in your AWS console %s", clusterConfig.ClusterName, *clusterConfig.Region, getCloudFormationURL(*clusterConfig.Region, clusterConfig.ClusterName)))
}
exit.Error(err)
}

err = assertClusterStatus(&accessConfig, clusterState.Status, clusterstate.StatusNotFound, clusterstate.StatusDeleteComplete)
if err != nil {
exit.Error(err)
}
out, exitCode, err := runManagerUpdateCommand("/root/install.sh", clusterConfig, awsCreds)
if err != nil {
exit.Error(err)
Expand Down Expand Up @@ -119,6 +140,30 @@ var _updateCmd = &cobra.Command{
exit.Error(err)
}

accessConfig, err := getClusterAccessConfig()
if err != nil {
exit.Error(err)
}

awsClient, err := newAWSClient(*accessConfig.Region, awsCreds)
if err != nil {
exit.Error(err)
}

clusterState, err := clusterstate.GetClusterState(awsClient, accessConfig)
if err != nil {
if errors.GetKind(err) == clusterstate.ErrUnexpectedCloudFormationStatus {
fmt.Println(clusterState.TableString())
fmt.Println(fmt.Sprintf("cluster %s in %s is in an unexpected state, please run `cortex cluster down` to delete the cluster or delete the cloudformation stacks manually in your AWS console %s", *accessConfig.ClusterName, *accessConfig.Region, getCloudFormationURLWithAccessConfig(accessConfig)))
}
exit.Error(err)
}

err = assertClusterStatus(accessConfig, clusterState.Status, clusterstate.StatusCreateComplete)
if err != nil {
exit.Error(err)
}

cachedClusterConfig := refreshCachedClusterConfig(awsCreds)

clusterConfig, err := getClusterUpdateConfig(cachedClusterConfig, awsCreds)
Expand Down Expand Up @@ -155,64 +200,16 @@ var _infoCmd = &cobra.Command{
exit.Error(err)
}

if _flagDebug {
accessConfig, err := getClusterAccessConfig()
if err != nil {
exit.Error(err)
}

out, exitCode, err := runManagerAccessCommand("/root/debug.sh", *accessConfig, awsCreds)
if err != nil {
exit.Error(err)
}
if exitCode == nil || *exitCode != 0 {
exit.Error(ErrorClusterDebug(out))
}

timestamp := time.Now().UTC().Format("2006-01-02-15-04-05")
userDebugPath := fmt.Sprintf("cortex-debug-%s.tgz", timestamp) // note: if modifying this string, also change it in files.IgnoreCortexDebug()
err = os.Rename(_debugPath, userDebugPath)
if err != nil {
exit.Error(errors.WithStack(err))
}

fmt.Println("saved cluster info to ./" + userDebugPath)
return
}

clusterConfig := refreshCachedClusterConfig(awsCreds)

out, exitCode, err := runManagerAccessCommand("/root/info.sh", clusterConfig.ToAccessConfig(), awsCreds)
accessConfig, err := getClusterAccessConfig()
if err != nil {
exit.Error(err)
}
if exitCode == nil || *exitCode != 0 {
exit.Error(ErrorClusterInfo(out))
}

fmt.Println()

httpResponse, err := HTTPGet("/info")
if err != nil {
fmt.Println(clusterConfig.UserStr())
fmt.Println("\n" + errors.Message(err, "unable to connect to operator"))
return
}

var infoResponse schema.InfoResponse
err = json.Unmarshal(httpResponse, &infoResponse)
if err != nil {
fmt.Println(clusterConfig.UserStr())
fmt.Println("\n" + errors.Message(err, "unable to parse operator response"))
return
if _flagDebug {
cmdDebug(awsCreds, accessConfig)
} else {
cmdInfo(awsCreds, accessConfig)
}
infoResponse.ClusterConfig.Config = clusterConfig

var items table.KeyValuePairs
items.Add("aws access key id", infoResponse.MaskedAWSAccessKeyID)
items.AddAll(infoResponse.ClusterConfig.UserTable())

items.Print()
},
}

Expand Down Expand Up @@ -244,14 +241,30 @@ var _downCmd = &cobra.Command{
}
warnIfNotAdmin(awsClient)

clusterState, err := clusterstate.GetClusterState(awsClient, accessConfig)
if err != nil {
if errors.GetKind(err) == clusterstate.ErrUnexpectedCloudFormationStatus {
fmt.Println(clusterState.TableString())
fmt.Println(fmt.Sprintf("cluster %s in %s is in an unexpected state, please delete the cloudformation stacks manually in your AWS console %s", *accessConfig.ClusterName, *accessConfig.Region, getCloudFormationURLWithAccessConfig(accessConfig)))
}
exit.Error(err)
}

switch clusterState.Status {
case clusterstate.StatusNotFound:
exit.Error(ErrorClusterDoesNotExist(*accessConfig.ClusterName, *accessConfig.Region))
case clusterstate.StatusDeleteComplete:
exit.Error(ErrorClusterAlreadyDeleted(*accessConfig.ClusterName, *accessConfig.Region))
}

prompt.YesOrExit(fmt.Sprintf("your cluster (%s in %s) will be spun down and all apis will be deleted, are you sure you want to continue?", *accessConfig.ClusterName, *accessConfig.Region), "", "")

out, exitCode, err := runManagerAccessCommand("/root/uninstall.sh", *accessConfig, awsCreds)
if err != nil {
exit.Error(err)
}
if exitCode == nil || *exitCode != 0 {
helpStr := fmt.Sprintf("\nNote: if this error cannot be resolved, please ensure that all CloudFormation stacks for this cluster eventually become been fully deleted (https://console.aws.amazon.com/cloudformation/home?region=%s#/stacks?filteringText=-%s-). If the stack deletion process has failed, please manually delete the stack from the AWS console (this may require manually deleting particular AWS resources that are blocking the stack deletion)", *accessConfig.Region, *accessConfig.ClusterName)
helpStr := fmt.Sprintf("\nNote: if this error cannot be resolved, please ensure that all CloudFormation stacks for this cluster eventually become been fully deleted (%s). If the stack deletion process has failed, please manually delete the stack from the AWS console (this may require manually deleting particular AWS resources that are blocking the stack deletion)", getCloudFormationURLWithAccessConfig(accessConfig))
fmt.Println(helpStr)
exit.Error(ErrorClusterDown(out + helpStr))
}
Expand Down Expand Up @@ -304,6 +317,87 @@ func promptForEmail() {
}
}

// cmdInfo prints the cluster's state table and, if the cluster status is
// StatusCreateComplete, runs /root/info.sh through the manager and then prints
// the cluster configuration. When the operator's /info endpoint responds with
// parseable JSON, the configuration is printed as a key/value table together
// with the masked AWS access key id; otherwise the cached configuration is
// printed followed by the connection or parsing error message.
//
// Failures that occur before the operator request are passed to exit.Error.
func cmdInfo(awsCreds AWSCredentials, accessConfig *clusterconfig.AccessConfig) {
	awsClient, err := newAWSClient(*accessConfig.Region, awsCreds)
	if err != nil {
		exit.Error(err)
	}

	state, err := clusterstate.GetClusterState(awsClient, accessConfig)
	if err != nil {
		if errors.GetKind(err) == clusterstate.ErrUnexpectedCloudFormationStatus {
			// Print the state table and a recovery hint before reporting the error itself.
			fmt.Println(state.TableString())
			recoveryHint := fmt.Sprintf("cluster %s in %s is in an unexpected state, please run `cortex cluster down` to delete the cluster or delete the cloudformation stacks manually in your AWS console %s", *accessConfig.ClusterName, *accessConfig.Region, getCloudFormationURLWithAccessConfig(accessConfig))
			fmt.Println(recoveryHint)
		}
		exit.Error(err)
	}

	fmt.Println(state.TableString())

	stackFailed := state.Status == clusterstate.StatusCreateFailed || state.Status == clusterstate.StatusDeleteFailed
	if stackFailed {
		consoleHint := fmt.Sprintf("More information can be found in your AWS console %s", getCloudFormationURLWithAccessConfig(accessConfig))
		fmt.Println(consoleHint)
		fmt.Println()
	}

	// Only StatusCreateComplete is allowed to proceed past this point.
	if err := assertClusterStatus(accessConfig, state.Status, clusterstate.StatusCreateComplete); err != nil {
		exit.Error(err)
	}

	clusterConfig := refreshCachedClusterConfig(awsCreds)

	output, exitCode, err := runManagerAccessCommand("/root/info.sh", *accessConfig, awsCreds)
	if err != nil {
		exit.Error(err)
	}
	if exitCode == nil || *exitCode != 0 {
		exit.Error(ErrorClusterInfo(output))
	}

	fmt.Println()

	// From here on, failures are reported inline and the function returns
	// normally after printing the cached configuration.
	rawInfo, err := HTTPGet("/info")
	if err != nil {
		fmt.Println(clusterConfig.UserStr())
		fmt.Println("\n" + errors.Message(err, "unable to connect to operator"))
		return
	}

	var info schema.InfoResponse
	if err := json.Unmarshal(rawInfo, &info); err != nil {
		fmt.Println(clusterConfig.UserStr())
		fmt.Println("\n" + errors.Message(err, "unable to parse operator response"))
		return
	}
	info.ClusterConfig.Config = clusterConfig

	var rows table.KeyValuePairs
	rows.Add("aws access key id", info.MaskedAWSAccessKeyID)
	rows.AddAll(info.ClusterConfig.UserTable())

	rows.Print()
}

// cmdDebug runs /root/debug.sh through the manager and then renames the file
// at _debugPath to cortex-debug-<UTC timestamp>.tgz (a relative path), printing
// the new location. Any failure is passed to exit.Error.
func cmdDebug(awsCreds AWSCredentials, accessConfig *clusterconfig.AccessConfig) {
	output, exitCode, err := runManagerAccessCommand("/root/debug.sh", *accessConfig, awsCreds)
	if err != nil {
		exit.Error(err)
	}
	if exitCode == nil || *exitCode != 0 {
		exit.Error(ErrorClusterDebug(output))
	}

	// note: if modifying the file name pattern, also change it in files.IgnoreCortexDebug()
	archiveName := fmt.Sprintf("cortex-debug-%s.tgz", time.Now().UTC().Format("2006-01-02-15-04-05"))
	if err := os.Rename(_debugPath, archiveName); err != nil {
		exit.Error(errors.WithStack(err))
	}

	fmt.Println("saved cluster info to ./" + archiveName)
}

func refreshCachedClusterConfig(awsCreds AWSCredentials) clusterconfig.Config {
accessConfig, err := getClusterAccessConfig()
if err != nil {
Expand Down Expand Up @@ -331,3 +425,30 @@ func refreshCachedClusterConfig(awsCreds AWSCredentials) clusterconfig.Config {
readCachedClusterConfigFile(refreshedClusterConfig, cachedConfigPath)
return *refreshedClusterConfig
}

// assertClusterStatus returns nil when status equals any of allowedStatuses.
// Otherwise it returns the CLI error that corresponds to status, built from
// the cluster name and region in accessConfig; statuses without a dedicated
// error are reported through ErrorFailedClusterStatus.
func assertClusterStatus(accessConfig *clusterconfig.AccessConfig, status clusterstate.Status, allowedStatuses ...clusterstate.Status) error {
	for i := range allowedStatuses {
		if allowedStatuses[i] == status {
			return nil
		}
	}

	// Every remaining path needs both values, so read them once here.
	name, region := *accessConfig.ClusterName, *accessConfig.Region

	switch status {
	case clusterstate.StatusCreateInProgress:
		return ErrorClusterUpInProgress(name, region)
	case clusterstate.StatusCreateComplete:
		return ErrorClusterAlreadyCreated(name, region)
	case clusterstate.StatusDeleteInProgress:
		return ErrorClusterDownInProgress(name, region)
	case clusterstate.StatusNotFound:
		return ErrorClusterDoesNotExist(name, region)
	case clusterstate.StatusDeleteComplete:
		return ErrorClusterAlreadyDeleted(name, region)
	}

	return ErrorFailedClusterStatus(status, name, region)
}

// getCloudFormationURLWithAccessConfig returns the result of
// getCloudFormationURL for the cluster name and region stored in accessConfig.
func getCloudFormationURLWithAccessConfig(accessConfig *clusterconfig.AccessConfig) string {
	clusterName, region := *accessConfig.ClusterName, *accessConfig.Region
	return getCloudFormationURL(clusterName, region)
}
53 changes: 53 additions & 0 deletions cli/cmd/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/cortexlabs/cortex/pkg/lib/errors"
s "github.com/cortexlabs/cortex/pkg/lib/strings"
"github.com/cortexlabs/cortex/pkg/lib/urls"
"github.com/cortexlabs/cortex/pkg/types/clusterstate"
)

const (
Expand All @@ -34,6 +35,10 @@ func errStrFailedToConnect(u url.URL) string {
return "failed to connect to " + urls.TrimQueryParamsURL(u)
}

// getCloudFormationURL returns the AWS console CloudFormation stacks URL for
// region, with the filteringText parameter set to "-<clusterName>-".
func getCloudFormationURL(clusterName, region string) string {
	const stacksURLFormat = "https://console.aws.amazon.com/cloudformation/home?region=%s#/stacks?filteringText=-%s-"

	// The format takes the region first and the cluster name second.
	return fmt.Sprintf(stacksURLFormat, region, clusterName)
}

const (
ErrCLINotConfigured = "cli.cli_not_configured"
ErrCortexYAMLNotFound = "cli.cortex_yaml_not_found"
Expand All @@ -55,6 +60,12 @@ const (
ErrClusterDown = "cli.cluster_down"
ErrDuplicateCLIEnvNames = "cli.duplicate_cli_env_names"
ErrInvalidOperatorEndpoint = "cli.invalid_operator_endpoint"
ErrClusterUpInProgress = "cli.cluster_up_in_progress"
ErrClusterAlreadyCreated = "cli.cluster_already_created"
ErrClusterDownInProgress = "cli.cluster_down_in_progress"
ErrClusterAlreadyDeleted = "cli.cluster_already_deleted"
ErrFailedClusterStatus = "cli.failed_cluster_status"
ErrClusterDoesNotExist = "cli.cluster_does_not_exist"
)

func ErrorCLINotConfigured(env string) error {
Expand Down Expand Up @@ -219,3 +230,45 @@ func ErrorInvalidOperatorEndpoint(endpoint string) error {
Message: fmt.Sprintf("%s is not a cortex operator endpoint; run `cortex cluster info` to show your operator endpoint or run `cortex cluster up` to spin up a new cluster", endpoint),
})
}

// ErrorClusterDoesNotExist returns an error of kind ErrClusterDoesNotExist,
// wrapped with errors.WithStack, stating that the cluster named clusterName
// does not exist in region.
func ErrorClusterDoesNotExist(clusterName string, region string) error {
	message := fmt.Sprintf("cluster %s in %s does not exist", clusterName, region)
	return errors.WithStack(&errors.Error{
		Kind:    ErrClusterDoesNotExist,
		Message: message,
	})
}

// ErrorClusterUpInProgress returns an error of kind ErrClusterUpInProgress,
// wrapped with errors.WithStack, stating that creation of the cluster named
// clusterName in region is currently in progress.
func ErrorClusterUpInProgress(clusterName string, region string) error {
	message := fmt.Sprintf("creation of cluster %s in %s is currently in progress", clusterName, region)
	return errors.WithStack(&errors.Error{
		Kind:    ErrClusterUpInProgress,
		Message: message,
	})
}

// ErrorClusterAlreadyCreated returns an error of kind ErrClusterAlreadyCreated,
// wrapped with errors.WithStack, stating that the cluster named clusterName in
// region has already been created.
func ErrorClusterAlreadyCreated(clusterName string, region string) error {
	message := fmt.Sprintf("cluster %s in %s has already been created", clusterName, region)
	return errors.WithStack(&errors.Error{
		Kind:    ErrClusterAlreadyCreated,
		Message: message,
	})
}

// ErrorClusterDownInProgress returns an error of kind ErrClusterDownInProgress,
// wrapped with errors.WithStack, stating that deletion of the cluster named
// clusterName in region is currently in progress.
func ErrorClusterDownInProgress(clusterName string, region string) error {
	message := fmt.Sprintf("deletion of cluster %s in %s is currently in progress", clusterName, region)
	return errors.WithStack(&errors.Error{
		Kind:    ErrClusterDownInProgress,
		Message: message,
	})
}

// ErrorClusterAlreadyDeleted returns an error of kind ErrClusterAlreadyDeleted,
// wrapped with errors.WithStack, stating that the cluster named clusterName in
// region has already been deleted or does not exist.
func ErrorClusterAlreadyDeleted(clusterName string, region string) error {
	message := fmt.Sprintf("cluster %s in %s has already been deleted or does not exist", clusterName, region)
	return errors.WithStack(&errors.Error{
		Kind:    ErrClusterAlreadyDeleted,
		Message: message,
	})
}

// ErrorFailedClusterStatus returns an error of kind ErrFailedClusterStatus,
// wrapped with errors.WithStack, reporting that the cluster named clusterName
// in region has the unexpected status and pointing the user at
// `cortex cluster down` and the CloudFormation console URL for the cluster.
func ErrorFailedClusterStatus(status clusterstate.Status, clusterName string, region string) error {
	consoleURL := getCloudFormationURL(clusterName, region)
	message := fmt.Sprintf("cluster %s in %s encountered an unexpected status %s, please try to delete the cluster with `cortex cluster down` or delete the cloudformation stacks manually in your AWS console %s", clusterName, region, string(status), consoleURL)
	return errors.WithStack(&errors.Error{
		Kind:    ErrFailedClusterStatus,
		Message: message,
	})
}
Loading