Skip to content

Remove mechanism for filling the instance distribution w/ compatibles #979

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cli/cmd/lib_cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ func getClusterUpdateConfig(cachedClusterConfig clusterconfig.Config, awsCreds A
userClusterConfig.Spot = cachedClusterConfig.Spot

if userClusterConfig.Spot != nil && *userClusterConfig.Spot {
err = userClusterConfig.AutoFillSpot(awsClient)
err = userClusterConfig.FillEmptySpotFields(awsClient)
if err != nil {
return nil, err
}
Expand Down
4 changes: 2 additions & 2 deletions docs/cluster-management/spot-instances.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ _WARNING: you are on the master branch, please refer to the docs on the branch t
spot: false

spot_config:
# additional instances with identical or better specs than the primary instance type (defaults to 2 instances sorted by price)
# additional instances with identical or better specs than the primary instance type (defaults to only the primary instance)
instance_distribution: [similar_instance_type_1, similar_instance_type_2]

# minimum number of on-demand instances (default: 0)
Expand All @@ -31,7 +31,7 @@ spot_config:
on_demand_backup: true
```

Spot instances are not guaranteed to be available. The chances of getting spot instances can be improved by providing `instance_distribution`, a list of alternative instance types to the primary `instance_type` you specified. If left blank, Cortex will autofill `instance_distribution` with up to 2 other similar instances. Cortex defaults the `max_price` to the on-demand price of the primary instance.
Spot instances are not guaranteed to be available. The chances of getting spot instances can be improved by providing `instance_distribution`, a list of alternative instance types to the primary `instance_type` you specified. If left blank, Cortex will only include the primary instance type in the `instance_distribution`. Cortex defaults the `max_price` to the on-demand price of the primary instance.

Spot instances can be mixed with on-demand instances by configuring `on_demand_base_capacity` and `on_demand_percentage_above_base_capacity`. `on_demand_base_capacity` enforces the minimum number of nodes that will be fulfilled by on-demand instances as your cluster is scaling up. `on_demand_percentage_above_base_capacity` defines the percentage of instances that will be on-demand after the base capacity has been fulfilled (the rest being spot instances). `instance_pools` is the number of pools per availability zone to allocate your instances from. See [here](https://docs.aws.amazon.com/autoscaling/ec2/APIReference/API_InstancesDistribution.html) for more details.

Expand Down
98 changes: 15 additions & 83 deletions pkg/types/clusterconfig/clusterconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package clusterconfig
import (
"fmt"
"regexp"
"sort"
"strings"

"github.com/aws/amazon-vpc-cni-k8s/pkg/awsutils"
Expand Down Expand Up @@ -412,11 +411,12 @@ func (cc *Config) Validate(awsClient *aws.Client) error {
return ErrorS3RegionDiffersFromCluster(cc.Bucket, bucketRegion, *cc.Region)
}

if _, ok := aws.InstanceMetadatas[*cc.Region][*cc.InstanceType]; !ok {
return errors.Wrap(ErrorInstanceTypeNotSupportedInRegion(*cc.InstanceType, *cc.Region), InstanceTypeKey)
primaryInstanceType := *cc.InstanceType
if _, ok := aws.InstanceMetadatas[*cc.Region][primaryInstanceType]; !ok {
return errors.Wrap(ErrorInstanceTypeNotSupportedInRegion(primaryInstanceType, *cc.Region), InstanceTypeKey)
}

if err := awsClient.VerifyInstanceQuota(*cc.InstanceType); err != nil {
if err := awsClient.VerifyInstanceQuota(primaryInstanceType); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if _, ok := errors.CauseOrSelf(err).(awserr.Error); !ok {
return errors.Wrap(err, InstanceTypeKey)
Expand All @@ -428,44 +428,29 @@ func (cc *Config) Validate(awsClient *aws.Client) error {
}

if cc.Spot != nil && *cc.Spot {
cc.AutoFillSpot(awsClient)
chosenInstance := aws.InstanceMetadatas[*cc.Region][*cc.InstanceType]
compatibleSpots := CompatibleSpotInstances(awsClient, chosenInstance, cc.SpotConfig.MaxPrice, _spotInstanceDistributionLength)
if len(compatibleSpots) == 0 {
return errors.Wrap(ErrorNoCompatibleSpotInstanceFound(chosenInstance.Type), InstanceTypeKey)
}
cc.FillEmptySpotFields(awsClient)

compatibleInstanceCount := 0
primaryInstance := aws.InstanceMetadatas[*cc.Region][primaryInstanceType]
for _, instanceType := range cc.SpotConfig.InstanceDistribution {
if instanceType == *cc.InstanceType {
if instanceType == primaryInstanceType {
continue
}
if _, ok := aws.InstanceMetadatas[*cc.Region][instanceType]; !ok {
return errors.Wrap(ErrorInstanceTypeNotSupportedInRegion(instanceType, *cc.Region), InstanceDistributionKey)
return errors.Wrap(ErrorInstanceTypeNotSupportedInRegion(instanceType, *cc.Region), SpotConfigKey, InstanceDistributionKey)
}

instanceMetadata := aws.InstanceMetadatas[*cc.Region][instanceType]
err := CheckSpotInstanceCompatibility(chosenInstance, instanceMetadata)
err := CheckSpotInstanceCompatibility(primaryInstance, instanceMetadata)
if err != nil {
return errors.Wrap(err, InstanceDistributionKey)
return errors.Wrap(err, SpotConfigKey, InstanceDistributionKey)
}

spotInstancePrice, awsErr := awsClient.SpotInstancePrice(instanceMetadata.Region, instanceMetadata.Type)
if awsErr == nil {
if err := CheckSpotInstancePriceCompatibility(chosenInstance, instanceMetadata, cc.SpotConfig.MaxPrice, spotInstancePrice); err != nil {
return errors.Wrap(err, InstanceDistributionKey)
if err := CheckSpotInstancePriceCompatibility(primaryInstance, instanceMetadata, cc.SpotConfig.MaxPrice, spotInstancePrice); err != nil {
return errors.Wrap(err, SpotConfigKey, InstanceDistributionKey)
}
}

compatibleInstanceCount++
}

if compatibleInstanceCount == 0 {
suggestions := []string{}
for _, compatibleInstance := range compatibleSpots {
suggestions = append(suggestions, compatibleInstance.Type)
}
return ErrorAtLeastOneInstanceDistribution(*cc.InstanceType, suggestions[0], suggestions[1:]...)
}

if cc.SpotConfig.OnDemandBaseCapacity != nil && *cc.SpotConfig.OnDemandBaseCapacity > *cc.MaxInstances {
Expand Down Expand Up @@ -524,50 +509,8 @@ func CheckSpotInstancePriceCompatibility(target aws.InstanceMetadata, suggested
return nil
}

// CompatibleSpotInstances returns up to numInstances instance types from the
// target instance's region that can stand in for targetInstance as spot
// instances. The target type itself is never included.
//
// Candidates are visited in ascending order of their Price field and kept
// only if they pass CheckCortexSupport and CheckSpotInstanceCompatibility.
// When the current spot price can be fetched, they must also pass
// CheckSpotInstancePriceCompatibility against maxPrice. If the spot price
// lookup fails, the price check is skipped and the candidate is kept.
//
// The returned slice is never nil; it is empty when nothing qualifies.
func CompatibleSpotInstances(awsClient *aws.Client, targetInstance aws.InstanceMetadata, maxPrice *float64, numInstances int) []aws.InstanceMetadata {
	// Gather every instance type in the region except the target itself.
	candidates := []aws.InstanceMetadata{}
	for name, metadata := range aws.InstanceMetadatas[targetInstance.Region] {
		if name != targetInstance.Type {
			candidates = append(candidates, metadata)
		}
	}

	// Cheapest first, so the earliest matches are the least expensive ones.
	sort.Slice(candidates, func(a, b int) bool {
		return candidates[a].Price < candidates[b].Price
	})

	matches := []aws.InstanceMetadata{}
	for _, candidate := range candidates {
		if CheckCortexSupport(candidate) != nil {
			continue
		}
		if CheckSpotInstanceCompatibility(targetInstance, candidate) != nil {
			continue
		}

		// The price lookup is best effort: only enforce the price rule when
		// a spot price was actually obtained.
		spotPrice, priceErr := awsClient.SpotInstancePrice(candidate.Region, candidate.Type)
		if priceErr == nil && CheckSpotInstancePriceCompatibility(targetInstance, candidate, maxPrice, spotPrice) != nil {
			continue
		}

		matches = append(matches, candidate)
		if len(matches) == numInstances {
			break
		}
	}

	return matches
}

func AutoGenerateSpotConfig(awsClient *aws.Client, spotConfig *SpotConfig, region string, instanceType string) error {
chosenInstance := aws.InstanceMetadatas[region][instanceType]
primaryInstance := aws.InstanceMetadatas[region][instanceType]
cleanedDistribution := []string{instanceType}
for _, spotInstance := range spotConfig.InstanceDistribution {
if spotInstance != instanceType {
Expand All @@ -576,19 +519,8 @@ func AutoGenerateSpotConfig(awsClient *aws.Client, spotConfig *SpotConfig, regio
}
spotConfig.InstanceDistribution = cleanedDistribution

if len(spotConfig.InstanceDistribution) == 1 {
compatibleSpots := CompatibleSpotInstances(awsClient, chosenInstance, spotConfig.MaxPrice, _spotInstanceDistributionLength)
if len(compatibleSpots) == 0 {
return errors.Wrap(ErrorNoCompatibleSpotInstanceFound(chosenInstance.Type), InstanceTypeKey)
}

for _, instance := range compatibleSpots {
spotConfig.InstanceDistribution = append(spotConfig.InstanceDistribution, instance.Type)
}
}

if spotConfig.MaxPrice == nil {
spotConfig.MaxPrice = &chosenInstance.Price
spotConfig.MaxPrice = &primaryInstance.Price
}

if spotConfig.OnDemandBaseCapacity == nil {
Expand All @@ -614,7 +546,7 @@ func AutoGenerateSpotConfig(awsClient *aws.Client, spotConfig *SpotConfig, regio
return nil
}

func (cc *Config) AutoFillSpot(awsClient *aws.Client) error {
func (cc *Config) FillEmptySpotFields(awsClient *aws.Client) error {
if cc.SpotConfig == nil {
cc.SpotConfig = &SpotConfig{}
}
Expand Down
18 changes: 1 addition & 17 deletions pkg/types/clusterconfig/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ func ErrorIncompatibleSpotInstanceTypeGPU(target aws.InstanceMetadata, suggested
func ErrorSpotPriceGreaterThanTargetOnDemand(suggestedSpotPrice float64, target aws.InstanceMetadata, suggested aws.InstanceMetadata) error {
return errors.WithStack(&errors.Error{
Kind: ErrSpotPriceGreaterThanTargetOnDemand,
Message: fmt.Sprintf("%s will not be allocated because its current spot price is $%g which is greater than than %s's on-demand price of $%g", suggested.Type, suggestedSpotPrice, target.Type, target.Price),
Message: fmt.Sprintf("%s will not be allocated because its current spot price is $%g which is greater than %s's on-demand price of $%g", suggested.Type, suggestedSpotPrice, target.Type, target.Price),
})
}

Expand All @@ -120,22 +120,6 @@ func ErrorInstanceTypeNotSupported(instanceType string) error {
})
}

// ErrorAtLeastOneInstanceDistribution builds the error reported when the
// instance distribution contains no compatible instance type other than
// instanceType. The message lists suggestion first, followed by any further
// suggestions, in the order given by the caller.
func ErrorAtLeastOneInstanceDistribution(instanceType string, suggestion string, suggestions ...string) error {
	// Build a fresh slice with the primary suggestion first. Appending to the
	// variadic slice would put it last and could write into the caller's
	// backing array when called as f(t, s[0], s[1:]...).
	allSuggestions := make([]string, 0, len(suggestions)+1)
	allSuggestions = append(allSuggestions, suggestion)
	allSuggestions = append(allSuggestions, suggestions...)

	message := strings.Join(allSuggestions, ", ")
	return errors.WithStack(&errors.Error{
		Kind:    ErrAtLeastOneInstanceDistribution,
		Message: fmt.Sprintf("at least one compatible instance type other than %s must be specified (suggestions: %s)", instanceType, message),
	})
}

// ErrorNoCompatibleSpotInstanceFound builds the error reported when no spot
// instance type compatible with instanceType could be found.
func ErrorNoCompatibleSpotInstanceFound(instanceType string) error {
	message := fmt.Sprintf("unable to find compatible spot instance types for %s", instanceType)
	cause := &errors.Error{
		Kind:    ErrNoCompatibleSpotInstanceFound,
		Message: message,
	}
	return errors.WithStack(cause)
}

func ErrorConfiguredWhenSpotIsNotEnabled(configKey string) error {
return errors.WithStack(&errors.Error{
Kind: ErrConfiguredWhenSpotIsNotEnabled,
Expand Down