Skip to content

Commit ab651d8

Browse files
authored
Validate # of security groups and in/out rules before clustering up (#2127)
1 parent b5e4753 commit ab651d8

File tree

5 files changed

+128
-27
lines changed

cli/cmd/lib_cluster_config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ func getInstallClusterConfig(awsClient *aws.Client, clusterConfigFile string, di
135135
return nil, err
136136
}
137137

138-
err = clusterConfig.Validate(awsClient, false)
138+
err = clusterConfig.Validate(awsClient)
139139
if err != nil {
140140
err = errors.Append(err, fmt.Sprintf("\n\ncluster configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor))
141141
return nil, errors.Wrap(err, clusterConfigFile)

pkg/lib/aws/ec2.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,28 @@ func (c *Client) DescribeVpcs() ([]ec2.Vpc, error) {
334334
return vpcs, nil
335335
}
336336

337+
func (c *Client) DescribeSecurityGroups() ([]ec2.SecurityGroup, error) {
338+
var sgs []ec2.SecurityGroup
339+
err := c.EC2().DescribeSecurityGroupsPages(&ec2.DescribeSecurityGroupsInput{}, func(output *ec2.DescribeSecurityGroupsOutput, lastPage bool) bool {
340+
if output == nil {
341+
return false
342+
}
343+
for _, sg := range output.SecurityGroups {
344+
if sg == nil {
345+
continue
346+
}
347+
sgs = append(sgs, *sg)
348+
}
349+
350+
return true
351+
})
352+
if err != nil {
353+
return nil, errors.WithStack(err)
354+
}
355+
356+
return sgs, nil
357+
}
358+
337359
func (c *Client) ListVolumes(tags ...ec2.Tag) ([]ec2.Volume, error) {
338360
var volumes []ec2.Volume
339361
err := c.EC2().DescribeVolumesPages(&ec2.DescribeVolumesInput{}, func(output *ec2.DescribeVolumesOutput, lastPage bool) bool {

pkg/lib/aws/errors.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ const (
4646
ErrEIPLimitExceeded = "aws.eip_limit_exceeded"
4747
ErrInternetGatewayLimitExceeded = "aws.internet_gateway_limit_exceeded"
4848
ErrVPCLimitExceeded = "aws.vpc_limit_exceeded"
49+
ErrSecurityGroupRulesExceeded = "aws.security_group_rules_exceeded"
50+
ErrSecurityGroupLimitExceeded = "aws.security_group_limit_exceeded"
4951
)
5052

5153
func IsAWSError(err error) bool {
@@ -232,3 +234,19 @@ func ErrorVPCLimitExceeded(currentLimit, additionalQuotaRequired int, region str
232234
Message: fmt.Sprintf("VPC limit of %d exceeded in region %s; remove some of the existing VPCs or increase your quota for VPCs by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, region, additionalQuotaRequired, url),
233235
})
234236
}
237+
238+
func ErrorSecurityGroupRulesExceeded(currentLimit, additionalQuotaRequired int, region string) error {
239+
url := "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"
240+
return errors.WithStack(&errors.Error{
241+
Kind: ErrSecurityGroupRulesExceeded,
242+
Message: fmt.Sprintf("security group rules limit of %d exceeded in region %s; use fewer availability zones, remove some node groups from your cluster config, reduce the number of CIDR white lists (if you have any), or increase your quota for inbound/outbound rules per security group by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, region, additionalQuotaRequired, url),
243+
})
244+
}
245+
246+
func ErrorSecurityGroupLimitExceeded(currentLimit, additionalQuotaRequired int, region string) error {
247+
url := "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"
248+
return errors.WithStack(&errors.Error{
249+
Kind: ErrSecurityGroupLimitExceeded,
250+
Message: fmt.Sprintf("security group limit of %d exceeded in region %s; remove some node groups from your cluster config or increase your quota for security groups by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, region, additionalQuotaRequired, url),
251+
})
252+
}

pkg/lib/aws/servicequotas.go

Lines changed: 73 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,18 @@ var _standardInstanceCategories = strset.New("a", "c", "d", "h", "i", "m", "r",
3232
var _knownInstanceCategories = strset.Union(_standardInstanceCategories, strset.New("p", "g", "inf", "x", "f"))
3333

3434
const (
35-
_elasticIPsQuotaCode = "L-0263D0A3"
36-
_internetGatewayQuotaCode = "L-A4707A72"
37-
_natGatewayQuotaCode = "L-FE5A380F"
38-
_vpcQuotaCode = "L-F678F1CE"
35+
_elasticIPsQuotaCode = "L-0263D0A3"
36+
_internetGatewayQuotaCode = "L-A4707A72"
37+
_natGatewayQuotaCode = "L-FE5A380F"
38+
_vpcQuotaCode = "L-F678F1CE"
39+
_securityGroupsQuotaCode = "L-E79EC296"
40+
_securityGroupRulesQuotaCode = "L-0EA8095F"
41+
42+
// 11 inbound rules
43+
_baseInboundRulesForNodeGroup = 11
44+
_inboundRulesPerAZ = 8
45+
// ClusterSharedNodeSecurityGroup, ControlPlaneSecurityGroup, eks-cluster-sg-<cluster-name>, and operator security group
46+
_baseNumberOfSecurityGroups = 4
3947
)
4048

4149
type InstanceTypeRequests struct {
@@ -145,12 +153,21 @@ func (c *Client) VerifyInstanceQuota(instances []InstanceTypeRequests) error {
145153
return nil
146154
}
147155

148-
func (c *Client) VerifyNetworkQuotas(requiredInternetGateways int, natGatewayRequired bool, highlyAvailableNATGateway bool, requiredVPCs int, availabilityZones strset.Set) error {
156+
func (c *Client) VerifyNetworkQuotas(
157+
requiredInternetGateways int,
158+
natGatewayRequired bool,
159+
highlyAvailableNATGateway bool,
160+
requiredVPCs int,
161+
availabilityZones strset.Set,
162+
numNodeGroups int,
163+
longestCIDRWhiteList int) error {
149164
quotaCodeToValueMap := map[string]int{
150-
_elasticIPsQuotaCode: 0, // elastic IP quota code
151-
_internetGatewayQuotaCode: 0, // internet gw quota code
152-
_natGatewayQuotaCode: 0, // nat gw quota code
153-
_vpcQuotaCode: 0, // vpc quota code
165+
_elasticIPsQuotaCode: 0, // elastic IP quota code
166+
_internetGatewayQuotaCode: 0, // internet gw quota code
167+
_natGatewayQuotaCode: 0, // nat gw quota code
168+
_vpcQuotaCode: 0, // vpc quota code
169+
_securityGroupsQuotaCode: 0, // security groups quota code
170+
_securityGroupRulesQuotaCode: 0, // security group rules quota code
154171
}
155172

156173
err := c.ServiceQuotas().ListServiceQuotasPages(
@@ -285,5 +302,52 @@ func (c *Client) VerifyNetworkQuotas(requiredInternetGateways int, natGatewayReq
285302
}
286303
}
287304

305+
// check rules quota for nodegroup SGs
306+
requiredRulesForSG := requiredRulesForNodeGroupSecurityGroup(len(availabilityZones), longestCIDRWhiteList)
307+
if requiredRulesForSG > quotaCodeToValueMap[_securityGroupRulesQuotaCode] {
308+
additionalQuotaRequired := requiredRulesForSG - quotaCodeToValueMap[_securityGroupRulesQuotaCode]
309+
return ErrorSecurityGroupRulesExceeded(quotaCodeToValueMap[_securityGroupRulesQuotaCode], additionalQuotaRequired, c.Region)
310+
}
311+
312+
// check rules quota for control plane SG
313+
requiredRulesForCPSG := requiredRulesForControlPlaneSecurityGroup(numNodeGroups)
314+
if requiredRulesForCPSG > quotaCodeToValueMap[_securityGroupRulesQuotaCode] {
315+
additionalQuotaRequired := requiredRulesForCPSG - quotaCodeToValueMap[_securityGroupRulesQuotaCode]
316+
return ErrorSecurityGroupRulesExceeded(quotaCodeToValueMap[_securityGroupRulesQuotaCode], additionalQuotaRequired, c.Region)
317+
}
318+
319+
// check security groups quota
320+
requiredSecurityGroups := requiredSecurityGroups(numNodeGroups)
321+
sgs, err := c.DescribeSecurityGroups()
322+
if err != nil {
323+
return err
324+
}
325+
if quotaCodeToValueMap[_securityGroupsQuotaCode]-len(sgs)-requiredSecurityGroups < 0 {
326+
additionalQuotaRequired := len(sgs) + requiredSecurityGroups - quotaCodeToValueMap[_securityGroupsQuotaCode]
327+
return ErrorSecurityGroupLimitExceeded(quotaCodeToValueMap[_securityGroupsQuotaCode], additionalQuotaRequired, c.Region)
328+
329+
}
330+
288331
return nil
289332
}
333+
334+
func requiredRulesForNodeGroupSecurityGroup(numAZs, whitelistLength int) int {
335+
whitelistRuleCount := 0
336+
if whitelistLength == 1 {
337+
whitelistRuleCount = 1
338+
} else if whitelistLength > 1 {
339+
whitelistRuleCount = 1 + 5*(whitelistLength-1)
340+
}
341+
return _baseInboundRulesForNodeGroup + numAZs*_inboundRulesPerAZ + whitelistRuleCount
342+
}
343+
344+
// requiredRulesForControlPlaneSecurityGroup returns the number of rules
// budgeted for the control plane security group given numNodeGroups
// user-defined node groups.
//
// One extra node group is counted for the operator node group. The result is
// the number of outbound rules; there are half as many inbound rules, so the
// outbound count is the limiting factor.
func requiredRulesForControlPlaneSecurityGroup(numNodeGroups int) int {
	return 2 * (numNodeGroups + 1)
}
349+
350+
func requiredSecurityGroups(numNodeGroups int) int {
351+
// each node group requires a security group
352+
return _baseNumberOfSecurityGroups + numNodeGroups
353+
}

pkg/types/clusterconfig/cluster_config.go

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,7 @@ func (cc *CoreConfig) SQSNamePrefix() string {
781781
}
782782

783783
// this validates the user-provided cluster config
784-
func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) error {
784+
func (cc *Config) Validate(awsClient *aws.Client) error {
785785
fmt.Print("verifying your configuration ...\n\n")
786786

787787
numNodeGroups := len(cc.NodeGroups)
@@ -817,12 +817,10 @@ func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) er
817817
})
818818
}
819819

820-
if !skipQuotaVerification {
821-
if err := awsClient.VerifyInstanceQuota(instances); err != nil {
822-
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
823-
if !aws.IsAWSError(err) {
824-
return errors.Wrap(err, NodeGroupsKey)
825-
}
820+
if err := awsClient.VerifyInstanceQuota(instances); err != nil {
821+
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
822+
if !aws.IsAWSError(err) {
823+
return errors.Wrap(err, NodeGroupsKey)
826824
}
827825
}
828826

@@ -909,16 +907,15 @@ func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) er
909907
}
910908
}
911909

912-
if !skipQuotaVerification {
913-
var requiredVPCs int
914-
if len(cc.Subnets) == 0 {
915-
requiredVPCs = 1
916-
}
917-
if err := awsClient.VerifyNetworkQuotas(1, cc.NATGateway != NoneNATGateway, cc.NATGateway == HighlyAvailableNATGateway, requiredVPCs, strset.FromSlice(cc.AvailabilityZones)); err != nil {
918-
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
919-
if !aws.IsAWSError(err) {
920-
return err
921-
}
910+
var requiredVPCs int
911+
if len(cc.Subnets) == 0 {
912+
requiredVPCs = 1
913+
}
914+
longestCIDRWhiteList := libmath.MaxInt(len(cc.APILoadBalancerCIDRWhiteList), len(cc.OperatorLoadBalancerCIDRWhiteList))
915+
if err := awsClient.VerifyNetworkQuotas(1, cc.NATGateway != NoneNATGateway, cc.NATGateway == HighlyAvailableNATGateway, requiredVPCs, strset.FromSlice(cc.AvailabilityZones), len(cc.NodeGroups), longestCIDRWhiteList); err != nil {
916+
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
917+
if !aws.IsAWSError(err) {
918+
return err
922919
}
923920
}
924921

0 commit comments

Comments
 (0)