Skip to content

Commit 578c942

Browse files
authored
Address AWS cluster up quota issues (#1912)
1 parent ecbeb92 commit 578c942

File tree

4 files changed

+311
-3
lines changed

4 files changed

+311
-3
lines changed

pkg/lib/aws/ec2.go

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,3 +224,108 @@ func (c *Client) ListSupportedAvailabilityZones(instanceType string, instanceTyp
224224

225225
return strset.Intersection(zoneSets...), nil
226226
}
227+
228+
func (c *Client) ListElasticIPs() ([]string, error) {
229+
addresses, err := c.EC2().DescribeAddresses(&ec2.DescribeAddressesInput{})
230+
if err != nil {
231+
return nil, errors.WithStack(err)
232+
}
233+
234+
addressesList := []string{}
235+
if addresses != nil {
236+
for _, address := range addresses.Addresses {
237+
if address != nil && address.PublicIp != nil {
238+
addressesList = append(addressesList, *address.PublicIp)
239+
}
240+
}
241+
}
242+
243+
return addressesList, nil
244+
}
245+
246+
func (c *Client) ListInternetGateways() ([]string, error) {
247+
gatewaysList := []string{}
248+
err := c.EC2().DescribeInternetGatewaysPages(&ec2.DescribeInternetGatewaysInput{}, func(output *ec2.DescribeInternetGatewaysOutput, lastPage bool) bool {
249+
if output == nil {
250+
return false
251+
}
252+
for _, gateway := range output.InternetGateways {
253+
if gateway != nil && gateway.InternetGatewayId != nil {
254+
gatewaysList = append(gatewaysList, *gateway.InternetGatewayId)
255+
}
256+
}
257+
258+
return true
259+
})
260+
if err != nil {
261+
return nil, errors.WithStack(err)
262+
}
263+
264+
return gatewaysList, nil
265+
}
266+
267+
func (c *Client) DescribeNATGateways() ([]ec2.NatGateway, error) {
268+
var gateways []ec2.NatGateway
269+
err := c.EC2().DescribeNatGatewaysPages(&ec2.DescribeNatGatewaysInput{}, func(output *ec2.DescribeNatGatewaysOutput, lastPage bool) bool {
270+
if output == nil {
271+
return false
272+
}
273+
for _, gateway := range output.NatGateways {
274+
if gateway == nil {
275+
continue
276+
}
277+
gateways = append(gateways, *gateway)
278+
}
279+
280+
return true
281+
})
282+
if err != nil {
283+
return nil, errors.WithStack(err)
284+
}
285+
286+
return gateways, nil
287+
}
288+
289+
func (c *Client) DescribeSubnets() ([]ec2.Subnet, error) {
290+
var subnets []ec2.Subnet
291+
err := c.EC2().DescribeSubnetsPages(&ec2.DescribeSubnetsInput{}, func(output *ec2.DescribeSubnetsOutput, lastPage bool) bool {
292+
if output == nil {
293+
return false
294+
}
295+
for _, subnet := range output.Subnets {
296+
if subnet == nil {
297+
continue
298+
}
299+
subnets = append(subnets, *subnet)
300+
}
301+
302+
return true
303+
})
304+
if err != nil {
305+
return nil, errors.WithStack(err)
306+
}
307+
308+
return subnets, nil
309+
}
310+
311+
func (c *Client) DescribeVpcs() ([]ec2.Vpc, error) {
312+
var vpcs []ec2.Vpc
313+
err := c.EC2().DescribeVpcsPages(&ec2.DescribeVpcsInput{}, func(output *ec2.DescribeVpcsOutput, lastPage bool) bool {
314+
if output == nil {
315+
return false
316+
}
317+
for _, vpc := range output.Vpcs {
318+
if vpc == nil {
319+
continue
320+
}
321+
vpcs = append(vpcs, *vpc)
322+
}
323+
324+
return true
325+
})
326+
if err != nil {
327+
return nil, errors.WithStack(err)
328+
}
329+
330+
return vpcs, nil
331+
}

pkg/lib/aws/errors.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,19 @@ const (
4343
ErrDashboardHeightOutOfRange = "aws.dashboard_height_out_of_range"
4444
ErrRegionNotConfigured = "aws.region_not_configured"
4545
ErrUnableToFindCredentials = "aws.unable_to_find_credentials"
46+
ErrNATGatewayLimitExceeded = "aws.nat_gateway_limit_exceeded"
47+
ErrEIPLimitExceeded = "aws.eip_limit_exceeded"
48+
ErrInternetGatewayLimitExceeded = "aws.internet_gateway_limit_exceeded"
49+
ErrVPCLimitExceeded = "aws.vpc_limit_exceeded"
4650
)
4751

52+
func IsAWSError(err error) bool {
53+
if _, ok := errors.CauseOrSelf(err).(awserr.Error); ok {
54+
return true
55+
}
56+
return false
57+
}
58+
4859
func IsNotFoundErr(err error) bool {
4960
return IsErrCode(err, "NotFound")
5061
}
@@ -196,3 +207,35 @@ func ErrorUnableToFindCredentials() error {
196207
Message: "unable to find aws credentials; instructions about configuring aws credentials can be found at https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html",
197208
})
198209
}
210+
211+
func ErrorNATGatewayLimitExceeded(currentLimit, additionalQuotaRequired int, availabilityZones []string, region string) error {
212+
url := "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"
213+
return errors.WithStack(&errors.Error{
214+
Kind: ErrNATGatewayLimitExceeded,
215+
Message: fmt.Sprintf("NAT gateway limit of %d exceeded in availability zones %s of region %s; remove some of the existing NAT gateways or increase your quota for NAT gateways by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, s.StrsAnd(availabilityZones), region, additionalQuotaRequired, url),
216+
})
217+
}
218+
219+
func ErrorEIPLimitExceeded(currentLimit, additionalQuotaRequired int, region string) error {
220+
url := "https://console.aws.amazon.com/servicequotas/home?#!/services/ec2/quotas"
221+
return errors.WithStack(&errors.Error{
222+
Kind: ErrEIPLimitExceeded,
223+
Message: fmt.Sprintf("elastic IPs limit of %d exceeded in region %s; remove some of the existing elastic IPs or increase your quota for elastic IPs by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, region, additionalQuotaRequired, url),
224+
})
225+
}
226+
227+
func ErrorInternetGatewayLimitExceeded(currentLimit, additionalQuotaRequired int, region string) error {
228+
url := "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"
229+
return errors.WithStack(&errors.Error{
230+
Kind: ErrInternetGatewayLimitExceeded,
231+
Message: fmt.Sprintf("internet gateway limit of %d exceeded in region %s; remove some of the existing internet gateways or increase your quota for internet gateways by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, region, additionalQuotaRequired, url),
232+
})
233+
}
234+
235+
func ErrorVPCLimitExceeded(currentLimit, additionalQuotaRequired int, region string) error {
236+
url := "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"
237+
return errors.WithStack(&errors.Error{
238+
Kind: ErrVPCLimitExceeded,
239+
Message: fmt.Sprintf("VPC limit of %d exceeded in region %s; remove some of the existing VPCs or increase your quota for VPCs by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, region, additionalQuotaRequired, url),
240+
})
241+
}

pkg/lib/aws/servicequotas.go

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,13 @@ var _instanceCategoryRegex = regexp.MustCompile(`[a-zA-Z]+`)
3131
var _standardInstanceCategories = strset.New("a", "c", "d", "h", "i", "m", "r", "t", "z")
3232
var _knownInstanceCategories = strset.Union(_standardInstanceCategories, strset.New("p", "g", "inf", "x", "f"))
3333

34+
// Service Quotas codes for the network limits that VerifyNetworkQuotas looks
// up; each is matched against the QuotaCode of the quotas listed by the
// Service Quotas API.
const (
	_elasticIPsQuotaCode      = "L-0263D0A3" // elastic IPs
	_internetGatewayQuotaCode = "L-A4707A72" // internet gateways
	_natGatewayQuotaCode      = "L-FE5A380F" // NAT gateways
	_vpcQuotaCode             = "L-F678F1CE" // VPCs
)
40+
3441
func (c *Client) VerifyInstanceQuota(instanceType string, requiredOnDemandInstances int64, requiredSpotInstances int64) error {
3542
if requiredOnDemandInstances == 0 && requiredSpotInstances == 0 {
3643
return nil
@@ -103,3 +110,146 @@ func (c *Client) VerifyInstanceQuota(instanceType string, requiredOnDemandInstan
103110

104111
return nil
105112
}
113+
114+
func (c *Client) VerifyNetworkQuotas(requiredInternetGateways int, natGatewayRequired bool, highlyAvailableNATGateway bool, requiredVPCs int, availabilityZones strset.Set) error {
115+
quotaCodeToValueMap := map[string]int{
116+
_elasticIPsQuotaCode: 0, // elastic IP quota code
117+
_internetGatewayQuotaCode: 0, // internet gw quota code
118+
_natGatewayQuotaCode: 0, // nat gw quota code
119+
_vpcQuotaCode: 0, // vpc quota code
120+
}
121+
122+
err := c.ServiceQuotas().ListServiceQuotasPages(
123+
&servicequotas.ListServiceQuotasInput{
124+
ServiceCode: aws.String("ec2"),
125+
},
126+
func(page *servicequotas.ListServiceQuotasOutput, lastPage bool) bool {
127+
if page == nil {
128+
return false
129+
}
130+
for _, quota := range page.Quotas {
131+
if quota == nil || quota.QuotaCode == nil || quota.Value == nil {
132+
continue
133+
}
134+
if _, ok := quotaCodeToValueMap[*quota.QuotaCode]; ok {
135+
quotaCodeToValueMap[*quota.QuotaCode] = int(*quota.Value)
136+
return false
137+
}
138+
}
139+
return true
140+
},
141+
)
142+
if err != nil {
143+
return errors.WithStack(err)
144+
}
145+
146+
err = c.ServiceQuotas().ListServiceQuotasPages(
147+
&servicequotas.ListServiceQuotasInput{
148+
ServiceCode: aws.String("vpc"),
149+
},
150+
func(page *servicequotas.ListServiceQuotasOutput, lastPage bool) bool {
151+
if page == nil {
152+
return false
153+
}
154+
for _, quota := range page.Quotas {
155+
if quota == nil || quota.QuotaCode == nil || quota.Value == nil {
156+
continue
157+
}
158+
if _, ok := quotaCodeToValueMap[*quota.QuotaCode]; ok {
159+
quotaCodeToValueMap[*quota.QuotaCode] = int(*quota.Value)
160+
}
161+
}
162+
return true
163+
},
164+
)
165+
if err != nil {
166+
return errors.WithStack(err)
167+
}
168+
169+
// check internet GW quota
170+
if requiredInternetGateways > 0 {
171+
internetGatewaysInUse, err := c.ListInternetGateways()
172+
if err != nil {
173+
return err
174+
}
175+
if quotaCodeToValueMap[_internetGatewayQuotaCode]-len(internetGatewaysInUse)-requiredInternetGateways < 0 {
176+
additionalQuotaRequired := len(internetGatewaysInUse) + requiredInternetGateways - quotaCodeToValueMap[_internetGatewayQuotaCode]
177+
return ErrorInternetGatewayLimitExceeded(quotaCodeToValueMap[_internetGatewayQuotaCode], additionalQuotaRequired, c.Region)
178+
}
179+
}
180+
181+
if natGatewayRequired {
182+
// get NAT GW in use per selected AZ
183+
natGateways, err := c.DescribeNATGateways()
184+
if err != nil {
185+
return err
186+
}
187+
subnets, err := c.DescribeSubnets()
188+
if err != nil {
189+
return err
190+
}
191+
azToGatewaysInUse := map[string]int{}
192+
for _, natGateway := range natGateways {
193+
if natGateway.SubnetId == nil {
194+
continue
195+
}
196+
for _, subnet := range subnets {
197+
if subnet.SubnetId == nil || subnet.AvailabilityZone == nil {
198+
continue
199+
}
200+
if !availabilityZones.Has(*subnet.AvailabilityZone) {
201+
continue
202+
}
203+
if *subnet.SubnetId == *natGateway.SubnetId {
204+
azToGatewaysInUse[*subnet.AvailabilityZone]++
205+
}
206+
}
207+
}
208+
// check NAT GW quota
209+
numOfExhaustedNATGatewayAZs := 0
210+
azsWithQuotaDeficit := []string{}
211+
for az, numActiveGatewaysOnAZ := range azToGatewaysInUse {
212+
// -1 comes from the NAT gateway we require per AZ
213+
azDeficit := quotaCodeToValueMap[_natGatewayQuotaCode] - numActiveGatewaysOnAZ - 1
214+
if azDeficit < 0 {
215+
numOfExhaustedNATGatewayAZs++
216+
azsWithQuotaDeficit = append(azsWithQuotaDeficit, az)
217+
}
218+
}
219+
if (highlyAvailableNATGateway && numOfExhaustedNATGatewayAZs > 0) || (!highlyAvailableNATGateway && numOfExhaustedNATGatewayAZs == len(availabilityZones)) {
220+
return ErrorNATGatewayLimitExceeded(quotaCodeToValueMap[_natGatewayQuotaCode], 1, azsWithQuotaDeficit, c.Region)
221+
}
222+
}
223+
224+
// check EIP quota
225+
if natGatewayRequired {
226+
elasticIPsInUse, err := c.ListElasticIPs()
227+
if err != nil {
228+
return err
229+
}
230+
var requiredElasticIPs int
231+
if highlyAvailableNATGateway {
232+
requiredElasticIPs = len(availabilityZones)
233+
} else {
234+
requiredElasticIPs = 1
235+
}
236+
if quotaCodeToValueMap[_elasticIPsQuotaCode]-len(elasticIPsInUse)-requiredElasticIPs < 0 {
237+
additionalQuotaRequired := len(elasticIPsInUse) + requiredElasticIPs - quotaCodeToValueMap[_elasticIPsQuotaCode]
238+
return ErrorEIPLimitExceeded(quotaCodeToValueMap[_elasticIPsQuotaCode], additionalQuotaRequired, c.Region)
239+
}
240+
}
241+
242+
// check VPC quota
243+
if requiredVPCs > 0 {
244+
vpcs, err := c.DescribeVpcs()
245+
if err != nil {
246+
return err
247+
}
248+
if quotaCodeToValueMap[_vpcQuotaCode]-len(vpcs)-requiredVPCs < 0 {
249+
additionalQuotaRequired := len(vpcs) + requiredVPCs - quotaCodeToValueMap[_vpcQuotaCode]
250+
return ErrorVPCLimitExceeded(quotaCodeToValueMap[_vpcQuotaCode], additionalQuotaRequired, c.Region)
251+
}
252+
}
253+
254+
return nil
255+
}

pkg/types/clusterconfig/cluster_config_aws.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ import (
2525
"regexp"
2626
"strings"
2727

28-
"github.com/aws/aws-sdk-go/aws/awserr"
2928
"github.com/aws/aws-sdk-go/service/iam"
3029
"github.com/cortexlabs/cortex/pkg/consts"
3130
"github.com/cortexlabs/cortex/pkg/lib/aws"
@@ -766,8 +765,8 @@ func (cc *Config) Validate(awsClient *aws.Client) error {
766765

767766
if err := awsClient.VerifyInstanceQuota(primaryInstanceType, cc.MaxPossibleOnDemandInstances(), cc.MaxPossibleSpotInstances()); err != nil {
768767
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
769-
if _, ok := errors.CauseOrSelf(err).(awserr.Error); !ok {
770-
return errors.Wrap(err, InstanceTypeKey)
768+
if !aws.IsAWSError(err) {
769+
return err
771770
}
772771
}
773772

@@ -793,6 +792,17 @@ func (cc *Config) Validate(awsClient *aws.Client) error {
793792
}
794793
}
795794

795+
var requiredVPCs int
796+
if len(cc.Subnets) == 0 {
797+
requiredVPCs = 1
798+
}
799+
if err := awsClient.VerifyNetworkQuotas(1, cc.NATGateway != NoneNATGateway, cc.NATGateway == HighlyAvailableNATGateway, requiredVPCs, strset.FromSlice(cc.AvailabilityZones)); err != nil {
800+
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
801+
if !aws.IsAWSError(err) {
802+
return err
803+
}
804+
}
805+
796806
if cc.Spot != nil && *cc.Spot {
797807
cc.FillEmptySpotFields()
798808

0 commit comments

Comments
 (0)