Skip to content

Commit 010221f

Browse files
authored
Add support for multi-instance-type clusters to AWS/GCP providers (#1951)
1 parent c50f8aa commit 010221f

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed

+1624
-1576
lines changed

.circleci/config.yml

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -146,15 +146,27 @@ jobs:
146146
echo 'export AWS_SECRET_ACCESS_KEY=${NIGHTLY_AWS_SECRET_ACCESS_KEY}' >> $BASH_ENV
147147
- run:
148148
name: Generate Cluster Config
149+
# using a variety of node groups to test the multi-instance-type cluster functionality
149150
command: |
150151
cat \<< EOF > ./cluster.yaml
151-
cluster_name: cortex
152152
provider: aws
153+
cluster_name: cortex
153154
region: us-east-1
154-
instance_type: g4dn.xlarge
155-
min_instances: 1
156-
max_instances: 2
157155
bucket: cortex-dev-nightly
156+
node_groups:
157+
- name: spot
158+
instance_type: t3.medium
159+
min_instances: 0
160+
max_instances: 1
161+
spot: true
162+
- name: cpu
163+
instance_type: c5.xlarge
164+
min_instances: 1
165+
max_instances: 2
166+
- name: gpu
167+
instance_type: g4dn.xlarge
168+
min_instances: 1
169+
max_instances: 2
158170
EOF
159171
- run-e2e-tests:
160172
provider: aws
@@ -174,16 +186,28 @@ jobs:
174186
echo 'export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/google_service_account.json' >> $BASH_ENV
175187
- run:
176188
name: Generate Cluster Config
189+
# using a variety of node pools to test the multi-instance-type cluster functionality
177190
command: |
178191
cat \<< EOF > ./cluster.yaml
192+
provider: gcp
179193
cluster_name: cortex
180194
project: cortexlabs-dev
181195
zone: us-east1-c
182-
provider: gcp
183-
instance_type: n1-standard-2
184-
accelerator_type: nvidia-tesla-t4
185-
min_instances: 1
186-
max_instances: 2
196+
node_pools:
197+
- name: preemptible
198+
instance_type: n1-standard-2
199+
min_instances: 0
200+
max_instances: 1
201+
preemptible: true
202+
- name: cpu
203+
instance_type: n1-standard-2
204+
min_instances: 1
205+
max_instances: 2
206+
- name: gpu
207+
instance_type: n1-standard-2
208+
accelerator_type: nvidia-tesla-t4
209+
min_instances: 1
210+
max_instances: 2
187211
EOF
188212
- run-e2e-tests:
189213
provider: gcp

cli/cmd/cluster.go

Lines changed: 55 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ var _clusterConfigureCmd = &cobra.Command{
302302
exit.Error(err)
303303
}
304304

305-
accessConfig, err := getClusterAccessConfigWithCache()
305+
accessConfig, err := getNewClusterAccessConfig(clusterConfigFile)
306306
if err != nil {
307307
exit.Error(err)
308308
}
@@ -317,7 +317,7 @@ var _clusterConfigureCmd = &cobra.Command{
317317
exit.Error(err)
318318
}
319319

320-
err = clusterstate.AssertClusterStatus(accessConfig.ClusterName, accessConfig.Region, clusterState.Status, clusterstate.StatusCreateComplete)
320+
err = clusterstate.AssertClusterStatus(accessConfig.ClusterName, accessConfig.Region, clusterState.Status, clusterstate.StatusCreateComplete, clusterstate.StatusUpdateComplete, clusterstate.StatusUpdateRollbackComplete)
321321
if err != nil {
322322
exit.Error(err)
323323
}
@@ -527,7 +527,7 @@ var _clusterExportCmd = &cobra.Command{
527527
exit.Error(err)
528528
}
529529

530-
err = clusterstate.AssertClusterStatus(accessConfig.ClusterName, accessConfig.Region, clusterState.Status, clusterstate.StatusCreateComplete)
530+
err = clusterstate.AssertClusterStatus(accessConfig.ClusterName, accessConfig.Region, clusterState.Status, clusterstate.StatusCreateComplete, clusterstate.StatusUpdateComplete, clusterstate.StatusUpdateRollbackComplete)
531531
if err != nil {
532532
exit.Error(err)
533533
}
@@ -668,7 +668,7 @@ func printInfoClusterState(awsClient *aws.Client, accessConfig *clusterconfig.Ac
668668
fmt.Println()
669669
}
670670

671-
err = clusterstate.AssertClusterStatus(accessConfig.ClusterName, accessConfig.Region, clusterState.Status, clusterstate.StatusCreateComplete)
671+
err = clusterstate.AssertClusterStatus(accessConfig.ClusterName, accessConfig.Region, clusterState.Status, clusterstate.StatusCreateComplete, clusterstate.StatusUpdateComplete, clusterstate.StatusUpdateRollbackComplete)
672672
if err != nil {
673673
return err
674674
}
@@ -679,6 +679,12 @@ func printInfoClusterState(awsClient *aws.Client, accessConfig *clusterconfig.Ac
679679
func printInfoOperatorResponse(clusterConfig clusterconfig.Config, operatorEndpoint string) error {
680680
fmt.Print("fetching cluster status ...\n\n")
681681

682+
yamlBytes, err := yaml.Marshal(clusterConfig)
683+
if err != nil {
684+
return err
685+
}
686+
yamlString := string(yamlBytes)
687+
682688
operatorConfig := cluster.OperatorConfig{
683689
Telemetry: isTelemetryEnabled(),
684690
ClientID: clientID(),
@@ -688,42 +694,67 @@ func printInfoOperatorResponse(clusterConfig clusterconfig.Config, operatorEndpo
688694

689695
infoResponse, err := cluster.Info(operatorConfig)
690696
if err != nil {
691-
fmt.Println(clusterConfig.UserStr())
697+
fmt.Println(yamlString)
692698
return err
693699
}
694700
infoResponse.ClusterConfig.Config = clusterConfig
695701

696-
printInfoClusterConfig(infoResponse)
702+
fmt.Println(console.Bold("metadata:"))
703+
fmt.Println(fmt.Sprintf("aws access key id: %s", infoResponse.MaskedAWSAccessKeyID))
704+
fmt.Println(fmt.Sprintf("%s: %s", clusterconfig.APIVersionUserKey, infoResponse.ClusterConfig.APIVersion))
705+
706+
fmt.Println()
707+
fmt.Println(console.Bold("cluster config:"))
708+
fmt.Print(yamlString)
709+
697710
printInfoPricing(infoResponse, clusterConfig)
698711
printInfoNodes(infoResponse)
699712

700713
return nil
701714
}
702715

703-
func printInfoClusterConfig(infoResponse *schema.InfoResponse) {
704-
var items table.KeyValuePairs
705-
items.Add("aws access key id", infoResponse.MaskedAWSAccessKeyID)
706-
items.AddAll(infoResponse.ClusterConfig.UserTable())
707-
items.Print()
708-
}
709-
710716
func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterconfig.Config) {
711-
numAPIInstances := len(infoResponse.NodeInfos)
712-
713-
var totalAPIInstancePrice float64
714-
for _, nodeInfo := range infoResponse.NodeInfos {
715-
totalAPIInstancePrice += nodeInfo.Price
716-
}
717-
718717
eksPrice := aws.EKSPrices[clusterConfig.Region]
719718
operatorInstancePrice := aws.InstanceMetadatas[clusterConfig.Region]["t3.medium"].Price
720719
operatorEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * 20 / 30 / 24
721720
metricsEBSPrice := aws.EBSMetadatas[clusterConfig.Region]["gp2"].PriceGB * 40 / 30 / 24
722721
nlbPrice := aws.NLBMetadatas[clusterConfig.Region].Price
723722
natUnitPrice := aws.NATMetadatas[clusterConfig.Region].Price
724-
apiEBSPrice := aws.EBSMetadatas[clusterConfig.Region][clusterConfig.InstanceVolumeType.String()].PriceGB * float64(clusterConfig.InstanceVolumeSize) / 30 / 24
725-
if clusterConfig.InstanceVolumeType.String() == "io1" && clusterConfig.InstanceVolumeIOPS != nil {
726-
apiEBSPrice += aws.EBSMetadatas[clusterConfig.Region][clusterConfig.InstanceVolumeType.String()].PriceIOPS * float64(*clusterConfig.InstanceVolumeIOPS) / 30 / 24
723+
724+
headers := []table.Header{
725+
{Title: "aws resource"},
726+
{Title: "cost per hour"},
727+
}
728+
729+
var rows [][]interface{}
730+
rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)})
731+
732+
var totalNodeGroupsPrice float64
733+
for _, ng := range clusterConfig.NodeGroups {
734+
var ngNamePrefix string
735+
if ng.Spot {
736+
ngNamePrefix = "cx-ws-"
737+
} else {
738+
ngNamePrefix = "cx-wd-"
739+
}
740+
nodesInfo := infoResponse.GetNodesWithNodeGroupName(ngNamePrefix + ng.Name)
741+
numInstances := len(nodesInfo)
742+
743+
ebsPrice := aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceGB * float64(ng.InstanceVolumeSize) / 30 / 24
744+
if ng.InstanceVolumeType.String() == "io1" && ng.InstanceVolumeIOPS != nil {
745+
ebsPrice += aws.EBSMetadatas[clusterConfig.Region][ng.InstanceVolumeType.String()].PriceIOPS * float64(*ng.InstanceVolumeIOPS) / 30 / 24
746+
}
747+
totalEBSPrice := ebsPrice * float64(numInstances)
748+
749+
totalInstancePrice := float64(0)
750+
for _, nodeInfo := range nodesInfo {
751+
totalInstancePrice += nodeInfo.Price
752+
}
753+
754+
rows = append(rows, []interface{}{fmt.Sprintf("nodegroup %s: %d (out of %d) %s for your apis", ng.Name, numInstances, ng.MaxInstances, s.PluralS("instance", numInstances)), s.DollarsAndTenthsOfCents(totalInstancePrice) + " total"})
755+
rows = append(rows, []interface{}{fmt.Sprintf("nodegroup %s: %d (out of %d) %dgb ebs %s for your apis", ng.Name, numInstances, ng.MaxInstances, ng.InstanceVolumeSize, s.PluralS("volume", numInstances)), s.DollarsAndTenthsOfCents(totalEBSPrice) + " total"})
756+
757+
totalNodeGroupsPrice += totalEBSPrice + totalInstancePrice
727758
}
728759

729760
var natTotalPrice float64
@@ -732,20 +763,9 @@ func printInfoPricing(infoResponse *schema.InfoResponse, clusterConfig clusterco
732763
} else if clusterConfig.NATGateway == clusterconfig.HighlyAvailableNATGateway {
733764
natTotalPrice = natUnitPrice * float64(len(clusterConfig.AvailabilityZones))
734765
}
735-
736-
totalPrice := eksPrice + totalAPIInstancePrice + apiEBSPrice*float64(numAPIInstances) +
737-
operatorInstancePrice*2 + operatorEBSPrice + metricsEBSPrice + nlbPrice*2 + natTotalPrice
766+
totalPrice := eksPrice + totalNodeGroupsPrice + operatorInstancePrice*2 + operatorEBSPrice + metricsEBSPrice + nlbPrice*2 + natTotalPrice
738767
fmt.Printf(console.Bold("\nyour cluster currently costs %s per hour\n\n"), s.DollarsAndCents(totalPrice))
739768

740-
headers := []table.Header{
741-
{Title: "aws resource"},
742-
{Title: "cost per hour"},
743-
}
744-
745-
var rows [][]interface{}
746-
rows = append(rows, []interface{}{"1 eks cluster", s.DollarsMaxPrecision(eksPrice)})
747-
rows = append(rows, []interface{}{fmt.Sprintf("%d %s for your apis", numAPIInstances, s.PluralS("instance", numAPIInstances)), s.DollarsAndTenthsOfCents(totalAPIInstancePrice) + " total"})
748-
rows = append(rows, []interface{}{fmt.Sprintf("%d %dgb ebs %s for your apis", numAPIInstances, clusterConfig.InstanceVolumeSize, s.PluralS("volume", numAPIInstances)), s.DollarsAndTenthsOfCents(apiEBSPrice*float64(numAPIInstances)) + " total"})
749769
rows = append(rows, []interface{}{"2 t3.medium instances for cortex", s.DollarsMaxPrecision(operatorInstancePrice * 2)})
750770
rows = append(rows, []interface{}{"1 20gb ebs volume for the operator", s.DollarsAndTenthsOfCents(operatorEBSPrice)})
751771
rows = append(rows, []interface{}{"1 40gb ebs volume for prometheus", s.DollarsAndTenthsOfCents(metricsEBSPrice)})

cli/cmd/cluster_gcp.go

Lines changed: 71 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"github.com/cortexlabs/cortex/pkg/lib/telemetry"
3636
"github.com/cortexlabs/cortex/pkg/types"
3737
"github.com/cortexlabs/cortex/pkg/types/clusterconfig"
38+
"github.com/cortexlabs/yaml"
3839
"github.com/spf13/cobra"
3940
containerpb "google.golang.org/genproto/googleapis/container/v1"
4041
)
@@ -373,7 +374,18 @@ func printInfoOperatorResponseGCP(accessConfig *clusterconfig.GCPAccessConfig, o
373374
return err
374375
}
375376

376-
infoResponse.ClusterConfig.UserTable().Print()
377+
yamlBytes, err := yaml.Marshal(infoResponse.ClusterConfig.GCPConfig)
378+
if err != nil {
379+
return err
380+
}
381+
yamlString := string(yamlBytes)
382+
383+
fmt.Println(console.Bold("metadata:"))
384+
fmt.Println(fmt.Sprintf("%s: %s", clusterconfig.APIVersionUserKey, infoResponse.ClusterConfig.APIVersion))
385+
386+
fmt.Println()
387+
fmt.Println(console.Bold("cluster config:"))
388+
fmt.Print(yamlString)
377389

378390
return nil
379391
}
@@ -448,25 +460,9 @@ func updateGCPCLIEnv(envName string, operatorEndpoint string, disallowPrompt boo
448460
func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Client) error {
449461
fmt.Print("○ creating GKE cluster ")
450462

451-
nodeLabels := map[string]string{"workload": "true"}
452-
var accelerators []*containerpb.AcceleratorConfig
453-
454-
if clusterConfig.AcceleratorType != nil {
455-
accelerators = append(accelerators, &containerpb.AcceleratorConfig{
456-
AcceleratorCount: *clusterConfig.AcceleratorsPerInstance,
457-
AcceleratorType: *clusterConfig.AcceleratorType,
458-
})
459-
nodeLabels["nvidia.com/gpu"] = "present"
460-
}
461-
462463
gkeClusterParent := fmt.Sprintf("projects/%s/locations/%s", clusterConfig.Project, clusterConfig.Zone)
463464
gkeClusterName := fmt.Sprintf("%s/clusters/%s", gkeClusterParent, clusterConfig.ClusterName)
464465

465-
initialNodeCount := int64(1)
466-
if clusterConfig.MinInstances > 0 {
467-
initialNodeCount = clusterConfig.MinInstances
468-
}
469-
470466
gkeClusterConfig := containerpb.Cluster{
471467
Name: clusterConfig.ClusterName,
472468
InitialClusterVersion: "1.18",
@@ -488,52 +484,68 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli
488484
Locations: []string{clusterConfig.Zone},
489485
}
490486

491-
if clusterConfig.Preemptible {
492-
gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{
493-
Name: "ng-cortex-wk-preemp",
494-
Config: &containerpb.NodeConfig{
495-
MachineType: clusterConfig.InstanceType,
496-
Labels: nodeLabels,
497-
Taints: []*containerpb.NodeTaint{
498-
{
499-
Key: "workload",
500-
Value: "true",
501-
Effect: containerpb.NodeTaint_NO_SCHEDULE,
487+
for _, nodePool := range clusterConfig.NodePools {
488+
nodeLabels := map[string]string{"workload": "true"}
489+
initialNodeCount := int64(1)
490+
if nodePool.MinInstances > 0 {
491+
initialNodeCount = nodePool.MinInstances
492+
}
493+
494+
var accelerators []*containerpb.AcceleratorConfig
495+
if nodePool.AcceleratorType != nil {
496+
accelerators = append(accelerators, &containerpb.AcceleratorConfig{
497+
AcceleratorCount: *nodePool.AcceleratorsPerInstance,
498+
AcceleratorType: *nodePool.AcceleratorType,
499+
})
500+
nodeLabels["nvidia.com/gpu"] = "present"
501+
}
502+
503+
if nodePool.Preemptible {
504+
gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{
505+
Name: "cx-ws-" + nodePool.Name,
506+
Config: &containerpb.NodeConfig{
507+
MachineType: nodePool.InstanceType,
508+
Labels: nodeLabels,
509+
Taints: []*containerpb.NodeTaint{
510+
{
511+
Key: "workload",
512+
Value: "true",
513+
Effect: containerpb.NodeTaint_NO_SCHEDULE,
514+
},
502515
},
503-
},
504-
Accelerators: accelerators,
505-
OauthScopes: []string{
506-
"https://www.googleapis.com/auth/compute",
507-
"https://www.googleapis.com/auth/devstorage.read_only",
508-
},
509-
ServiceAccount: gcpClient.ClientEmail,
510-
Preemptible: true,
511-
},
512-
InitialNodeCount: int32(initialNodeCount),
513-
})
514-
}
515-
if clusterConfig.OnDemandBackup || !clusterConfig.Preemptible {
516-
gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{
517-
Name: "ng-cortex-wk-on-dmd",
518-
Config: &containerpb.NodeConfig{
519-
MachineType: clusterConfig.InstanceType,
520-
Labels: nodeLabels,
521-
Taints: []*containerpb.NodeTaint{
522-
{
523-
Key: "workload",
524-
Value: "true",
525-
Effect: containerpb.NodeTaint_NO_SCHEDULE,
516+
Accelerators: accelerators,
517+
OauthScopes: []string{
518+
"https://www.googleapis.com/auth/compute",
519+
"https://www.googleapis.com/auth/devstorage.read_only",
526520
},
521+
ServiceAccount: gcpClient.ClientEmail,
522+
Preemptible: true,
527523
},
528-
Accelerators: accelerators,
529-
OauthScopes: []string{
530-
"https://www.googleapis.com/auth/compute",
531-
"https://www.googleapis.com/auth/devstorage.read_only",
524+
InitialNodeCount: int32(initialNodeCount),
525+
})
526+
} else {
527+
gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{
528+
Name: "cx-wd-" + nodePool.Name,
529+
Config: &containerpb.NodeConfig{
530+
MachineType: nodePool.InstanceType,
531+
Labels: nodeLabels,
532+
Taints: []*containerpb.NodeTaint{
533+
{
534+
Key: "workload",
535+
Value: "true",
536+
Effect: containerpb.NodeTaint_NO_SCHEDULE,
537+
},
538+
},
539+
Accelerators: accelerators,
540+
OauthScopes: []string{
541+
"https://www.googleapis.com/auth/compute",
542+
"https://www.googleapis.com/auth/devstorage.read_only",
543+
},
544+
ServiceAccount: gcpClient.ClientEmail,
532545
},
533-
ServiceAccount: gcpClient.ClientEmail,
534-
},
535-
InitialNodeCount: int32(initialNodeCount),
536-
})
546+
InitialNodeCount: int32(initialNodeCount),
547+
})
548+
}
537549
}
538550

539551
if clusterConfig.Network != nil {

0 commit comments

Comments
 (0)