Skip to content

Enable preemptible instances on GCP #1791

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jan 18, 2021
2 changes: 1 addition & 1 deletion build/images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ dev_images_gcp=(
non_dev_images_cluster=(
"tensorflow-serving-cpu"
"tensorflow-serving-gpu"
"cluster-autoscaler"
"operator"
"istio-proxy"
"istio-pilot"
Expand All @@ -72,7 +73,6 @@ non_dev_images_cluster=(
non_dev_images_aws=(
# includes non_dev_images_cluster
"tensorflow-serving-inf"
"cluster-autoscaler"
"metrics-server"
"inferentia"
"neuron-rtd"
Expand Down
73 changes: 50 additions & 23 deletions cli/cmd/cluster_gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,11 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli
gkeClusterParent := fmt.Sprintf("projects/%s/locations/%s", *clusterConfig.Project, *clusterConfig.Zone)
gkeClusterName := fmt.Sprintf("%s/clusters/%s", gkeClusterParent, clusterConfig.ClusterName)

initialNodeCount := int64(1)
if *clusterConfig.MinInstances > 0 {
initialNodeCount = *clusterConfig.MinInstances
}

gkeClusterConfig := containerpb.Cluster{
Name: clusterConfig.ClusterName,
InitialClusterVersion: "1.17",
Expand All @@ -449,34 +454,56 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli
},
InitialNodeCount: 1,
},
{
Name: "ng-cortex-worker-on-demand",
Config: &containerpb.NodeConfig{
MachineType: *clusterConfig.InstanceType,
Labels: nodeLabels,
Taints: []*containerpb.NodeTaint{
{
Key: "workload",
Value: "true",
Effect: containerpb.NodeTaint_NO_SCHEDULE,
},
},
Locations: []string{*clusterConfig.Zone},
}

if clusterConfig.Preemptible {
gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{
Name: "ng-cortex-wk-preemp",
Config: &containerpb.NodeConfig{
MachineType: *clusterConfig.InstanceType,
Labels: nodeLabels,
Taints: []*containerpb.NodeTaint{
{
Key: "workload",
Value: "true",
Effect: containerpb.NodeTaint_NO_SCHEDULE,
},
Accelerators: accelerators,
OauthScopes: []string{
"https://www.googleapis.com/auth/compute",
"https://www.googleapis.com/auth/devstorage.read_only",
},
Accelerators: accelerators,
OauthScopes: []string{
"https://www.googleapis.com/auth/compute",
"https://www.googleapis.com/auth/devstorage.read_only",
},
ServiceAccount: gcpClient.ClientEmail,
Preemptible: true,
},
InitialNodeCount: int32(initialNodeCount),
})
}
if clusterConfig.OnDemandBackup || !clusterConfig.Preemptible {
gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{
Name: "ng-cortex-wk-on-dmd",
Config: &containerpb.NodeConfig{
MachineType: *clusterConfig.InstanceType,
Labels: nodeLabels,
Taints: []*containerpb.NodeTaint{
{
Key: "workload",
Value: "true",
Effect: containerpb.NodeTaint_NO_SCHEDULE,
},
ServiceAccount: gcpClient.ClientEmail,
},
Autoscaling: &containerpb.NodePoolAutoscaling{
Enabled: true,
MinNodeCount: int32(*clusterConfig.MinInstances),
MaxNodeCount: int32(*clusterConfig.MaxInstances),
Accelerators: accelerators,
OauthScopes: []string{
"https://www.googleapis.com/auth/compute",
"https://www.googleapis.com/auth/devstorage.read_only",
},
InitialNodeCount: int32(*clusterConfig.MinInstances),
ServiceAccount: gcpClient.ClientEmail,
},
},
Locations: []string{*clusterConfig.Zone},
InitialNodeCount: int32(initialNodeCount),
})
}

if clusterConfig.Network != nil {
Expand Down
7 changes: 7 additions & 0 deletions docs/clusters/gcp/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ min_instances: 1
# maximum number of instances
max_instances: 5

# enable the use of preemptible instances
preemptible: false

# enable the use of on-demand backup instances which will be used when preemptible capacity runs out
# default is true when preemptible instances are used
# on_demand_backup: true

# GPU to attach to your instance (optional)
# accelerator_type: nvidia-tesla-t4

Expand Down
5 changes: 5 additions & 0 deletions manager/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ function cluster_up_gcp() {
kubectl apply -f /workspace/apis.yaml >/dev/null
echo "✓"

echo -n "○ configuring autoscaling "
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/cluster-autoscaler.yaml.j2 > /workspace/cluster-autoscaler.yaml
kubectl apply -f /workspace/cluster-autoscaler.yaml >/dev/null
echo "✓"

echo -n "○ configuring logging "
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/fluent-bit.yaml.j2 > /workspace/fluent-bit.yaml
kubectl apply -f /workspace/fluent-bit.yaml >/dev/null
Expand Down
23 changes: 21 additions & 2 deletions manager/manifests/cluster-autoscaler.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -131,18 +131,25 @@ subjects:
name: cluster-autoscaler
namespace: kube-system
---
{% if config.get('spot_config') is not none and config['spot_config'].get('on_demand_backup', false) %}
{% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or config.get('on_demand_backup') %}
apiVersion: v1
kind: ConfigMap
metadata:
name: cluster-autoscaler-priority-expander
namespace: kube-system
data:
priorities: |-
{% if config.get('spot_config') %}
10:
- .*ng-cortex-worker-on-demand.*
50:
- .*ng-cortex-worker-spot.*
{% else %}
10:
- .*ng-cortex-wk-on-dmd.*
50:
- .*ng-cortex-wk-preemp.*
{% endif %}
---
{% endif %}
apiVersion: apps/v1
Expand Down Expand Up @@ -177,9 +184,13 @@ spec:
- ./cluster-autoscaler
- --v=4
- --stderrthreshold=info
{% if config["provider"] == "aws" %}
- --cloud-provider=aws
{% else %}
- --cloud-provider=gce
{% endif %}
- --skip-nodes-with-local-storage=false
{% if config.get('spot_config') is not none and config['spot_config'].get('on_demand_backup', false) %}
{% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or config.get('on_demand_backup') %}
- --expander=priority
{% else %}
- --expander=least-waste
Expand All @@ -189,7 +200,11 @@ spec:
- --ok-total-unready-count=30
- --max-node-provision-time=5m
- --scan-interval=20s
{% if config["provider"] == "aws" %}
- --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ config['cluster_name'] }}
{% else %}
- --node-group-auto-discovery=mig:namePrefix=gke-{{ config['cluster_name'] }}-ng-cortex-wk,min={{ config["min_instances"] }},max={{ config["max_instances"] }}
{% endif %}
volumeMounts:
- name: ssl-certs
mountPath: /etc/ssl/certs/ca-certificates.crt
Expand All @@ -198,7 +213,11 @@ spec:
volumes:
- name: ssl-certs
hostPath:
{% if config["provider"] == "aws" %}
path: "/etc/ssl/certs/ca-bundle.crt"
{% else %}
path: "/etc/ssl/certs/ca-certificates.crt"
{% endif %}
strategy:
type: RollingUpdate
rollingUpdate:
Expand Down
1 change: 0 additions & 1 deletion pkg/types/clusterconfig/cluster_config_aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -1155,7 +1155,6 @@ func (cc *Config) UserTable() table.KeyValuePairs {
items.Add(InstanceVolumeTypeUserKey, cc.InstanceVolumeType)
items.Add(InstanceVolumeIOPSUserKey, cc.InstanceVolumeIOPS)
items.Add(SpotUserKey, s.YesNo(*cc.Spot))

if cc.Spot != nil && *cc.Spot {
items.Add(InstanceDistributionUserKey, cc.SpotConfig.InstanceDistribution)
items.Add(OnDemandBaseCapacityUserKey, *cc.SpotConfig.OnDemandBaseCapacity)
Expand Down
38 changes: 38 additions & 0 deletions pkg/types/clusterconfig/cluster_config_gcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/cortexlabs/cortex/pkg/lib/pointer"
"github.com/cortexlabs/cortex/pkg/lib/prompt"
"github.com/cortexlabs/cortex/pkg/lib/slices"
s "github.com/cortexlabs/cortex/pkg/lib/strings"
"github.com/cortexlabs/cortex/pkg/lib/table"
"github.com/cortexlabs/cortex/pkg/types"
)
Expand All @@ -45,11 +46,14 @@ type GCPConfig struct {
OperatorLoadBalancerScheme LoadBalancerScheme `json:"operator_load_balancer_scheme" yaml:"operator_load_balancer_scheme"`
MinInstances *int64 `json:"min_instances" yaml:"min_instances"`
MaxInstances *int64 `json:"max_instances" yaml:"max_instances"`
Preemptible bool `json:"preemptible" yaml:"preemptible"`
OnDemandBackup bool `json:"on_demand_backup" yaml:"on_demand_backup"`
ClusterName string `json:"cluster_name" yaml:"cluster_name"`
Telemetry bool `json:"telemetry" yaml:"telemetry"`
ImageOperator string `json:"image_operator" yaml:"image_operator"`
ImageManager string `json:"image_manager" yaml:"image_manager"`
ImageDownloader string `json:"image_downloader" yaml:"image_downloader"`
ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
ImageFluentBit string `json:"image_fluent_bit" yaml:"image_fluent_bit"`
ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"`
ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"`
Expand Down Expand Up @@ -206,6 +210,20 @@ var UserGCPValidation = &cr.StructValidation{
Validator: validateClusterName,
},
},
{
StructField: "Preemptible",
BoolValidation: &cr.BoolValidation{
Default: false,
},
},
{
StructField: "OnDemandBackup",
DefaultDependentFields: []string{"Preemptible"},
DefaultDependentFieldsFunc: func(vals []interface{}) interface{} {
return vals[0].(bool)
},
BoolValidation: &cr.BoolValidation{},
},
{
StructField: "Project",
StringPtrValidation: &cr.StringPtrValidation{},
Expand Down Expand Up @@ -235,6 +253,13 @@ var UserGCPValidation = &cr.StructValidation{
Validator: validateImageVersion,
},
},
{
StructField: "ImageClusterAutoscaler",
StringValidation: &cr.StringValidation{
Default: "quay.io/cortexlabs/cluster-autoscaler:" + consts.CortexVersion,
Validator: validateImageVersion,
},
},
{
StructField: "ImageFluentBit",
StringValidation: &cr.StringValidation{
Expand Down Expand Up @@ -387,6 +412,10 @@ func (cc *GCPConfig) Validate(GCP *gcp.Client) error {
}
}

if !cc.Preemptible && cc.OnDemandBackup {
return ErrorFieldConfigurationDependentOnCondition(OnDemandBackupKey, s.Bool(cc.OnDemandBackup), PreemptibleKey, s.Bool(cc.Preemptible))
}

return nil
}

Expand Down Expand Up @@ -490,6 +519,7 @@ func SetGCPDefaults(cc *GCPConfig) error {
if errors.HasError(errs) {
return errors.FirstError(errs...)
}

return nil
}

Expand Down Expand Up @@ -542,6 +572,8 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs {
if cc.AcceleratorsPerInstance != nil {
items.Add(AcceleratorsPerInstanceUserKey, *cc.AcceleratorsPerInstance)
}
items.Add(PreemptibleUserKey, s.YesNo(cc.Preemptible))
items.Add(OnDemandBackupUserKey, s.YesNo(cc.OnDemandBackup))
if cc.Network != nil {
items.Add(NetworkUserKey, *cc.Network)
}
Expand All @@ -554,6 +586,7 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs {
items.Add(ImageOperatorUserKey, cc.ImageOperator)
items.Add(ImageManagerUserKey, cc.ImageManager)
items.Add(ImageDownloaderUserKey, cc.ImageDownloader)
items.Add(ImageClusterAutoscalerUserKey, cc.ImageClusterAutoscaler)
items.Add(ImageFluentBitUserKey, cc.ImageFluentBit)
items.Add(ImageIstioProxyUserKey, cc.ImageIstioProxy)
items.Add(ImageIstioPilotUserKey, cc.ImageIstioPilot)
Expand Down Expand Up @@ -602,6 +635,8 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} {
if cc.ClusterName != "cortex" {
event["cluster_name._is_custom"] = true
}
event["preemptible"] = cc.Preemptible
event["on_demand_backup"] = cc.OnDemandBackup
if cc.Zone != nil {
event["zone._is_defined"] = true
event["zone"] = *cc.Zone
Expand All @@ -615,6 +650,9 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} {
if !strings.HasPrefix(cc.ImageDownloader, "cortexlabs/") {
event["image_downloader._is_custom"] = true
}
if !strings.HasPrefix(cc.ImageClusterAutoscaler, "cortexlabs/") {
event["image_cluster_autoscaler._is_custom"] = true
}
if !strings.HasPrefix(cc.ImageFluentBit, "cortexlabs/") {
event["image_fluent_bit._is_custom"] = true
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/types/clusterconfig/config_key.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ const (
InstanceVolumeIOPSKey = "instance_volume_iops"
SpotKey = "spot"
SpotConfigKey = "spot_config"
PreemptibleKey = "preemptible"
InstanceDistributionKey = "instance_distribution"
OnDemandBaseCapacityKey = "on_demand_base_capacity"
OnDemandPercentageAboveBaseCapacityKey = "on_demand_percentage_above_base_capacity"
Expand Down Expand Up @@ -82,6 +83,7 @@ const (
SSLCertificateARNUserKey = "ssl certificate arn"
BucketUserKey = "s3 bucket"
SpotUserKey = "use spot instances"
PreemptibleUserKey = "use preemptible instances"
InstanceTypeUserKey = "instance type"
AcceleratorTypeUserKey = "accelerator type"
AcceleratorsPerInstanceUserKey = "accelerators per instance"
Expand Down
8 changes: 8 additions & 0 deletions pkg/types/clusterconfig/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
ErrNoNATGatewayWithSubnets = "clusterconfig.no_nat_gateway_with_subnets"
ErrSpecifyOneOrNone = "clusterconfig.specify_one_or_none"
ErrDependentFieldMustBeSpecified = "clusterconfig.dependent_field_must_be_specified"
ErrFieldConfigurationDependentOnCondition = "clusterconfig.field_configuration_dependent_on_condition"
ErrDidNotMatchStrictS3Regex = "clusterconfig.did_not_match_strict_s3_regex"
ErrNATRequiredWithPrivateSubnetVisibility = "clusterconfig.nat_required_with_private_subnet_visibility"
ErrS3RegionDiffersFromCluster = "clusterconfig.s3_region_differs_from_cluster"
Expand Down Expand Up @@ -249,6 +250,13 @@ func ErrorDependentFieldMustBeSpecified(configuredField string, dependencyField
})
}

// ErrorFieldConfigurationDependentOnCondition builds the error reported when
// one config field is set to a value that is not allowed given the value of
// another field. The message has the form
// "cannot set <configuredField> = <configuredFieldValue> when <dependencyField> = <dependencyFieldValue>"
// and the error kind is ErrFieldConfigurationDependentOnCondition.
func ErrorFieldConfigurationDependentOnCondition(configuredField string, configuredFieldValue string, dependencyField string, dependencyFieldValue string) error {
	// Render the user-facing text first so the error literal below stays compact.
	msg := fmt.Sprintf(
		"cannot set %s = %s when %s = %s",
		configuredField, configuredFieldValue,
		dependencyField, dependencyFieldValue,
	)

	configErr := &errors.Error{
		Kind:    ErrFieldConfigurationDependentOnCondition,
		Message: msg,
	}
	return errors.WithStack(configErr)
}

func ErrorDidNotMatchStrictS3Regex() error {
return errors.WithStack(&errors.Error{
Kind: ErrDidNotMatchStrictS3Regex,
Expand Down