Skip to content

Commit 2873e64

Browse files
authored
Check cluster status before running cluster commands (#881)
1 parent 48a5656 commit 2873e64

File tree

3 files changed

+253
-58
lines changed

3 files changed

+253
-58
lines changed

cli/cmd/cluster.go

Lines changed: 175 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"github.com/cortexlabs/cortex/pkg/lib/telemetry"
3434
"github.com/cortexlabs/cortex/pkg/operator/schema"
3535
"github.com/cortexlabs/cortex/pkg/types/clusterconfig"
36+
"github.com/cortexlabs/cortex/pkg/types/clusterstate"
3637
"github.com/spf13/cobra"
3738
)
3839

@@ -89,6 +90,26 @@ var _upCmd = &cobra.Command{
8990
exit.Error(err)
9091
}
9192

93+
accessConfig := clusterConfig.ToAccessConfig()
94+
95+
awsClient, err := newAWSClient(*accessConfig.Region, awsCreds)
96+
if err != nil {
97+
exit.Error(err)
98+
}
99+
100+
clusterState, err := clusterstate.GetClusterState(awsClient, &accessConfig)
if err != nil {
	// For an unexpected CloudFormation status, print the cluster state table
	// and a link to the CloudFormation console before exiting with the error.
	// NOTE(review): this relies on GetClusterState returning a non-nil state
	// alongside this error kind — confirm against clusterstate.
	if errors.GetKind(err) == clusterstate.ErrUnexpectedCloudFormationStatus {
		fmt.Println(clusterState.TableString())
		// getCloudFormationURL takes (clusterName, region); passing them in
		// the opposite order yields a link with the two values exchanged.
		fmt.Println(fmt.Sprintf("cluster %s in %s is in an unexpected state, please run `cortex cluster down` to delete the cluster or delete the cloudformation stacks manually in your AWS console %s", clusterConfig.ClusterName, *clusterConfig.Region, getCloudFormationURL(clusterConfig.ClusterName, *clusterConfig.Region)))
	}
	exit.Error(err)
}
108+
109+
err = assertClusterStatus(&accessConfig, clusterState.Status, clusterstate.StatusNotFound, clusterstate.StatusDeleteComplete)
110+
if err != nil {
111+
exit.Error(err)
112+
}
92113
out, exitCode, err := runManagerUpdateCommand("/root/install.sh", clusterConfig, awsCreds)
93114
if err != nil {
94115
exit.Error(err)
@@ -119,6 +140,30 @@ var _updateCmd = &cobra.Command{
119140
exit.Error(err)
120141
}
121142

143+
accessConfig, err := getClusterAccessConfig()
144+
if err != nil {
145+
exit.Error(err)
146+
}
147+
148+
awsClient, err := newAWSClient(*accessConfig.Region, awsCreds)
149+
if err != nil {
150+
exit.Error(err)
151+
}
152+
153+
clusterState, err := clusterstate.GetClusterState(awsClient, accessConfig)
154+
if err != nil {
155+
if errors.GetKind(err) == clusterstate.ErrUnexpectedCloudFormationStatus {
156+
fmt.Println(clusterState.TableString())
157+
fmt.Println(fmt.Sprintf("cluster %s in %s is in an unexpected state, please run `cortex cluster down` to delete the cluster or delete the cloudformation stacks manually in your AWS console %s", *accessConfig.ClusterName, *accessConfig.Region, getCloudFormationURLWithAccessConfig(accessConfig)))
158+
}
159+
exit.Error(err)
160+
}
161+
162+
err = assertClusterStatus(accessConfig, clusterState.Status, clusterstate.StatusCreateComplete)
163+
if err != nil {
164+
exit.Error(err)
165+
}
166+
122167
cachedClusterConfig := refreshCachedClusterConfig(awsCreds)
123168

124169
clusterConfig, err := getClusterUpdateConfig(cachedClusterConfig, awsCreds)
@@ -155,64 +200,16 @@ var _infoCmd = &cobra.Command{
155200
exit.Error(err)
156201
}
157202

158-
if _flagDebug {
159-
accessConfig, err := getClusterAccessConfig()
160-
if err != nil {
161-
exit.Error(err)
162-
}
163-
164-
out, exitCode, err := runManagerAccessCommand("/root/debug.sh", *accessConfig, awsCreds)
165-
if err != nil {
166-
exit.Error(err)
167-
}
168-
if exitCode == nil || *exitCode != 0 {
169-
exit.Error(ErrorClusterDebug(out))
170-
}
171-
172-
timestamp := time.Now().UTC().Format("2006-01-02-15-04-05")
173-
userDebugPath := fmt.Sprintf("cortex-debug-%s.tgz", timestamp) // note: if modifying this string, also change it in files.IgnoreCortexDebug()
174-
err = os.Rename(_debugPath, userDebugPath)
175-
if err != nil {
176-
exit.Error(errors.WithStack(err))
177-
}
178-
179-
fmt.Println("saved cluster info to ./" + userDebugPath)
180-
return
181-
}
182-
183-
clusterConfig := refreshCachedClusterConfig(awsCreds)
184-
185-
out, exitCode, err := runManagerAccessCommand("/root/info.sh", clusterConfig.ToAccessConfig(), awsCreds)
203+
accessConfig, err := getClusterAccessConfig()
186204
if err != nil {
187205
exit.Error(err)
188206
}
189-
if exitCode == nil || *exitCode != 0 {
190-
exit.Error(ErrorClusterInfo(out))
191-
}
192-
193-
fmt.Println()
194-
195-
httpResponse, err := HTTPGet("/info")
196-
if err != nil {
197-
fmt.Println(clusterConfig.UserStr())
198-
fmt.Println("\n" + errors.Message(err, "unable to connect to operator"))
199-
return
200-
}
201207

202-
var infoResponse schema.InfoResponse
203-
err = json.Unmarshal(httpResponse, &infoResponse)
204-
if err != nil {
205-
fmt.Println(clusterConfig.UserStr())
206-
fmt.Println("\n" + errors.Message(err, "unable to parse operator response"))
207-
return
208+
if _flagDebug {
209+
cmdDebug(awsCreds, accessConfig)
210+
} else {
211+
cmdInfo(awsCreds, accessConfig)
208212
}
209-
infoResponse.ClusterConfig.Config = clusterConfig
210-
211-
var items table.KeyValuePairs
212-
items.Add("aws access key id", infoResponse.MaskedAWSAccessKeyID)
213-
items.AddAll(infoResponse.ClusterConfig.UserTable())
214-
215-
items.Print()
216213
},
217214
}
218215

@@ -244,14 +241,30 @@ var _downCmd = &cobra.Command{
244241
}
245242
warnIfNotAdmin(awsClient)
246243

244+
clusterState, err := clusterstate.GetClusterState(awsClient, accessConfig)
245+
if err != nil {
246+
if errors.GetKind(err) == clusterstate.ErrUnexpectedCloudFormationStatus {
247+
fmt.Println(clusterState.TableString())
248+
fmt.Println(fmt.Sprintf("cluster %s in %s is in an unexpected state, please delete the cloudformation stacks manually in your AWS console %s", *accessConfig.ClusterName, *accessConfig.Region, getCloudFormationURLWithAccessConfig(accessConfig)))
249+
}
250+
exit.Error(err)
251+
}
252+
253+
switch clusterState.Status {
254+
case clusterstate.StatusNotFound:
255+
exit.Error(ErrorClusterDoesNotExist(*accessConfig.ClusterName, *accessConfig.Region))
256+
case clusterstate.StatusDeleteComplete:
257+
exit.Error(ErrorClusterAlreadyDeleted(*accessConfig.ClusterName, *accessConfig.Region))
258+
}
259+
247260
prompt.YesOrExit(fmt.Sprintf("your cluster (%s in %s) will be spun down and all apis will be deleted, are you sure you want to continue?", *accessConfig.ClusterName, *accessConfig.Region), "", "")
248261

249262
out, exitCode, err := runManagerAccessCommand("/root/uninstall.sh", *accessConfig, awsCreds)
250263
if err != nil {
251264
exit.Error(err)
252265
}
253266
// A missing or non-zero exit code means the uninstall script did not succeed;
// point the user at the CloudFormation console so leftover stacks can be
// removed by hand, then exit with the script output plus that hint.
if exitCode == nil || *exitCode != 0 {
	helpStr := fmt.Sprintf("\nNote: if this error cannot be resolved, please ensure that all CloudFormation stacks for this cluster eventually become fully deleted (%s). If the stack deletion process has failed, please manually delete the stack from the AWS console (this may require manually deleting particular AWS resources that are blocking the stack deletion)", getCloudFormationURLWithAccessConfig(accessConfig))
	fmt.Println(helpStr)
	exit.Error(ErrorClusterDown(out + helpStr))
}
@@ -304,6 +317,87 @@ func promptForEmail() {
304317
}
305318
}
306319

320+
func cmdInfo(awsCreds AWSCredentials, accessConfig *clusterconfig.AccessConfig) {
321+
awsClient, err := newAWSClient(*accessConfig.Region, awsCreds)
322+
if err != nil {
323+
exit.Error(err)
324+
}
325+
326+
clusterState, err := clusterstate.GetClusterState(awsClient, accessConfig)
327+
if err != nil {
328+
if errors.GetKind(err) == clusterstate.ErrUnexpectedCloudFormationStatus {
329+
fmt.Println(clusterState.TableString())
330+
fmt.Println(fmt.Sprintf("cluster %s in %s is in an unexpected state, please run `cortex cluster down` to delete the cluster or delete the cloudformation stacks manually in your AWS console %s", *accessConfig.ClusterName, *accessConfig.Region, getCloudFormationURLWithAccessConfig(accessConfig)))
331+
}
332+
exit.Error(err)
333+
}
334+
335+
fmt.Println(clusterState.TableString())
336+
if clusterState.Status == clusterstate.StatusCreateFailed || clusterState.Status == clusterstate.StatusDeleteFailed {
337+
fmt.Println(fmt.Sprintf("More information can be found in your AWS console %s", getCloudFormationURLWithAccessConfig(accessConfig)))
338+
fmt.Println()
339+
}
340+
341+
err = assertClusterStatus(accessConfig, clusterState.Status, clusterstate.StatusCreateComplete)
342+
if err != nil {
343+
exit.Error(err)
344+
}
345+
346+
clusterConfig := refreshCachedClusterConfig(awsCreds)
347+
348+
out, exitCode, err := runManagerAccessCommand("/root/info.sh", *accessConfig, awsCreds)
349+
if err != nil {
350+
exit.Error(err)
351+
}
352+
if exitCode == nil || *exitCode != 0 {
353+
exit.Error(ErrorClusterInfo(out))
354+
}
355+
356+
fmt.Println()
357+
358+
httpResponse, err := HTTPGet("/info")
359+
if err != nil {
360+
fmt.Println(clusterConfig.UserStr())
361+
fmt.Println("\n" + errors.Message(err, "unable to connect to operator"))
362+
return
363+
}
364+
365+
var infoResponse schema.InfoResponse
366+
err = json.Unmarshal(httpResponse, &infoResponse)
367+
if err != nil {
368+
fmt.Println(clusterConfig.UserStr())
369+
fmt.Println("\n" + errors.Message(err, "unable to parse operator response"))
370+
return
371+
}
372+
infoResponse.ClusterConfig.Config = clusterConfig
373+
374+
var items table.KeyValuePairs
375+
items.Add("aws access key id", infoResponse.MaskedAWSAccessKeyID)
376+
items.AddAll(infoResponse.ClusterConfig.UserTable())
377+
378+
items.Print()
379+
}
380+
381+
func cmdDebug(awsCreds AWSCredentials, accessConfig *clusterconfig.AccessConfig) {
382+
out, exitCode, err := runManagerAccessCommand("/root/debug.sh", *accessConfig, awsCreds)
383+
if err != nil {
384+
exit.Error(err)
385+
}
386+
if exitCode == nil || *exitCode != 0 {
387+
exit.Error(ErrorClusterDebug(out))
388+
}
389+
390+
timestamp := time.Now().UTC().Format("2006-01-02-15-04-05")
391+
userDebugPath := fmt.Sprintf("cortex-debug-%s.tgz", timestamp) // note: if modifying this string, also change it in files.IgnoreCortexDebug()
392+
err = os.Rename(_debugPath, userDebugPath)
393+
if err != nil {
394+
exit.Error(errors.WithStack(err))
395+
}
396+
397+
fmt.Println("saved cluster info to ./" + userDebugPath)
398+
return
399+
}
400+
307401
func refreshCachedClusterConfig(awsCreds AWSCredentials) clusterconfig.Config {
308402
accessConfig, err := getClusterAccessConfig()
309403
if err != nil {
@@ -331,3 +425,30 @@ func refreshCachedClusterConfig(awsCreds AWSCredentials) clusterconfig.Config {
331425
readCachedClusterConfigFile(refreshedClusterConfig, cachedConfigPath)
332426
return *refreshedClusterConfig
333427
}
428+
429+
func assertClusterStatus(accessConfig *clusterconfig.AccessConfig, status clusterstate.Status, allowedStatuses ...clusterstate.Status) error {
430+
for _, allowedStatus := range allowedStatuses {
431+
if status == allowedStatus {
432+
return nil
433+
}
434+
}
435+
436+
switch status {
437+
case clusterstate.StatusCreateInProgress:
438+
return ErrorClusterUpInProgress(*accessConfig.ClusterName, *accessConfig.Region)
439+
case clusterstate.StatusCreateComplete:
440+
return ErrorClusterAlreadyCreated(*accessConfig.ClusterName, *accessConfig.Region)
441+
case clusterstate.StatusDeleteInProgress:
442+
return ErrorClusterDownInProgress(*accessConfig.ClusterName, *accessConfig.Region)
443+
case clusterstate.StatusNotFound:
444+
return ErrorClusterDoesNotExist(*accessConfig.ClusterName, *accessConfig.Region)
445+
case clusterstate.StatusDeleteComplete:
446+
return ErrorClusterAlreadyDeleted(*accessConfig.ClusterName, *accessConfig.Region)
447+
default:
448+
return ErrorFailedClusterStatus(status, *accessConfig.ClusterName, *accessConfig.Region)
449+
}
450+
}
451+
452+
func getCloudFormationURLWithAccessConfig(accessConfig *clusterconfig.AccessConfig) string {
453+
return getCloudFormationURL(*accessConfig.ClusterName, *accessConfig.Region)
454+
}

cli/cmd/errors.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/cortexlabs/cortex/pkg/lib/errors"
2424
s "github.com/cortexlabs/cortex/pkg/lib/strings"
2525
"github.com/cortexlabs/cortex/pkg/lib/urls"
26+
"github.com/cortexlabs/cortex/pkg/types/clusterstate"
2627
)
2728

2829
const (
@@ -34,6 +35,10 @@ func errStrFailedToConnect(u url.URL) string {
3435
return "failed to connect to " + urls.TrimQueryParamsURL(u)
3536
}
3637

38+
// getCloudFormationURL returns the AWS console URL for the CloudFormation
// stack list in region, with the filter text set to "-<clusterName>-".
func getCloudFormationURL(clusterName, region string) string {
	const consoleURLFormat = "https://console.aws.amazon.com/cloudformation/home?region=%s#/stacks?filteringText=-%s-"
	return fmt.Sprintf(consoleURLFormat, region, clusterName)
}
41+
3742
const (
3843
ErrCLINotConfigured = "cli.cli_not_configured"
3944
ErrCortexYAMLNotFound = "cli.cortex_yaml_not_found"
@@ -55,6 +60,12 @@ const (
5560
ErrClusterDown = "cli.cluster_down"
5661
ErrDuplicateCLIEnvNames = "cli.duplicate_cli_env_names"
5762
ErrInvalidOperatorEndpoint = "cli.invalid_operator_endpoint"
63+
ErrClusterUpInProgress = "cli.cluster_up_in_progress"
64+
ErrClusterAlreadyCreated = "cli.cluster_already_created"
65+
ErrClusterDownInProgress = "cli.cluster_down_in_progress"
66+
ErrClusterAlreadyDeleted = "cli.cluster_already_deleted"
67+
ErrFailedClusterStatus = "cli.failed_cluster_status"
68+
ErrClusterDoesNotExist = "cli.cluster_does_not_exist"
5869
)
5970

6071
func ErrorCLINotConfigured(env string) error {
@@ -219,3 +230,45 @@ func ErrorInvalidOperatorEndpoint(endpoint string) error {
219230
Message: fmt.Sprintf("%s is not a cortex operator endpoint; run `cortex cluster info` to show your operator endpoint or run `cortex cluster up` to spin up a new cluster", endpoint),
220231
})
221232
}
233+
234+
func ErrorClusterDoesNotExist(clusterName string, region string) error {
235+
return errors.WithStack(&errors.Error{
236+
Kind: ErrClusterDoesNotExist,
237+
Message: fmt.Sprintf("cluster %s in %s does not exist", clusterName, region),
238+
})
239+
}
240+
241+
func ErrorClusterUpInProgress(clusterName string, region string) error {
242+
return errors.WithStack(&errors.Error{
243+
Kind: ErrClusterUpInProgress,
244+
Message: fmt.Sprintf("creation of cluster %s in %s is currently in progress", clusterName, region),
245+
})
246+
}
247+
248+
func ErrorClusterAlreadyCreated(clusterName string, region string) error {
249+
return errors.WithStack(&errors.Error{
250+
Kind: ErrClusterAlreadyCreated,
251+
Message: fmt.Sprintf("cluster %s in %s has already been created", clusterName, region),
252+
})
253+
}
254+
255+
func ErrorClusterDownInProgress(clusterName string, region string) error {
256+
return errors.WithStack(&errors.Error{
257+
Kind: ErrClusterDownInProgress,
258+
Message: fmt.Sprintf("deletion of cluster %s in %s is currently in progress", clusterName, region),
259+
})
260+
}
261+
262+
func ErrorClusterAlreadyDeleted(clusterName string, region string) error {
263+
return errors.WithStack(&errors.Error{
264+
Kind: ErrClusterAlreadyDeleted,
265+
Message: fmt.Sprintf("cluster %s in %s has already been deleted or does not exist", clusterName, region),
266+
})
267+
}
268+
269+
func ErrorFailedClusterStatus(status clusterstate.Status, clusterName string, region string) error {
270+
return errors.WithStack(&errors.Error{
271+
Kind: ErrFailedClusterStatus,
272+
Message: fmt.Sprintf("cluster %s in %s encountered an unexpected status %s, please try to delete the cluster with `cortex cluster down` or delete the cloudformation stacks manually in your AWS console %s", clusterName, region, string(status), getCloudFormationURL(clusterName, region)),
273+
})
274+
}

0 commit comments

Comments
 (0)