Skip to content

Validate # of security groups and in/out rules before clustering up #2127

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 28, 2021
2 changes: 1 addition & 1 deletion cli/cmd/lib_cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ func getInstallClusterConfig(awsClient *aws.Client, clusterConfigFile string, di
return nil, err
}

err = clusterConfig.Validate(awsClient, false)
err = clusterConfig.Validate(awsClient)
if err != nil {
err = errors.Append(err, fmt.Sprintf("\n\ncluster configuration schema can be found at https://docs.cortex.dev/v/%s/", consts.CortexVersionMinor))
return nil, errors.Wrap(err, clusterConfigFile)
Expand Down
22 changes: 22 additions & 0 deletions pkg/lib/aws/ec2.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,28 @@ func (c *Client) DescribeVpcs() ([]ec2.Vpc, error) {
return vpcs, nil
}

// DescribeSecurityGroups calls DescribeSecurityGroupsPages with an empty input
// and returns the security groups gathered from every page handed to the
// callback. Nil entries within a page are skipped. Any error reported by the
// paginated call is returned wrapped with a stack trace, along with a nil slice.
func (c *Client) DescribeSecurityGroups() ([]ec2.SecurityGroup, error) {
	var groups []ec2.SecurityGroup

	// collectPage appends the page's groups to the result; returning false on a
	// nil page stops pagination, returning true asks for the next page.
	collectPage := func(page *ec2.DescribeSecurityGroupsOutput, _ bool) bool {
		if page == nil {
			return false
		}
		for _, group := range page.SecurityGroups {
			if group != nil {
				groups = append(groups, *group)
			}
		}
		return true
	}

	if err := c.EC2().DescribeSecurityGroupsPages(&ec2.DescribeSecurityGroupsInput{}, collectPage); err != nil {
		return nil, errors.WithStack(err)
	}

	return groups, nil
}

func (c *Client) ListVolumes(tags ...ec2.Tag) ([]ec2.Volume, error) {
var volumes []ec2.Volume
err := c.EC2().DescribeVolumesPages(&ec2.DescribeVolumesInput{}, func(output *ec2.DescribeVolumesOutput, lastPage bool) bool {
Expand Down
18 changes: 18 additions & 0 deletions pkg/lib/aws/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ const (
ErrEIPLimitExceeded = "aws.eip_limit_exceeded"
ErrInternetGatewayLimitExceeded = "aws.internet_gateway_limit_exceeded"
ErrVPCLimitExceeded = "aws.vpc_limit_exceeded"
ErrSecurityGroupRulesExceeded = "aws.security_group_rules_exceeded"
ErrSecurityGroupLimitExceeded = "aws.security_group_limit_exceeded"
)

func IsAWSError(err error) bool {
Expand Down Expand Up @@ -232,3 +234,19 @@ func ErrorVPCLimitExceeded(currentLimit, additionalQuotaRequired int, region str
Message: fmt.Sprintf("VPC limit of %d exceeded in region %s; remove some of the existing VPCs or increase your quota for VPCs by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)", currentLimit, region, additionalQuotaRequired, url),
})
}

// ErrorSecurityGroupRulesExceeded builds the error returned when the number of
// rules needed in a single security group is above the account's quota.
//
// currentLimit is the quota value that was exceeded, additionalQuotaRequired is
// how far above that quota the requirement is, and region is the AWS region
// named in the message. The result has kind ErrSecurityGroupRulesExceeded and
// carries a stack trace.
func ErrorSecurityGroupRulesExceeded(currentLimit, additionalQuotaRequired int, region string) error {
	const quotasURL = "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"

	message := fmt.Sprintf(
		"security group rules limit of %d exceeded in region %s; use fewer availability zones, remove some node groups from your cluster config, reduce the number of CIDR white lists (if you have any), or increase your quota for inbound/outbound rules per security group by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)",
		currentLimit,
		region,
		additionalQuotaRequired,
		quotasURL,
	)

	return errors.WithStack(&errors.Error{
		Kind:    ErrSecurityGroupRulesExceeded,
		Message: message,
	})
}

// ErrorSecurityGroupLimitExceeded builds the error returned when the number of
// security groups needed is above the account's quota.
//
// currentLimit is the quota value that was exceeded, additionalQuotaRequired is
// how far above that quota the requirement is, and region is the AWS region
// named in the message. The result has kind ErrSecurityGroupLimitExceeded and
// carries a stack trace.
func ErrorSecurityGroupLimitExceeded(currentLimit, additionalQuotaRequired int, region string) error {
	const quotasURL = "https://console.aws.amazon.com/servicequotas/home?#!/services/vpc/quotas"

	message := fmt.Sprintf(
		"security group limit of %d exceeded in region %s; remove some node groups from your cluster config or increase your quota for security groups by at least %d here: %s (if your request was recently approved, please allow ~30 minutes for AWS to reflect this change)",
		currentLimit,
		region,
		additionalQuotaRequired,
		quotasURL,
	)

	return errors.WithStack(&errors.Error{
		Kind:    ErrSecurityGroupLimitExceeded,
		Message: message,
	})
}
82 changes: 73 additions & 9 deletions pkg/lib/aws/servicequotas.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,18 @@ var _standardInstanceCategories = strset.New("a", "c", "d", "h", "i", "m", "r",
var _knownInstanceCategories = strset.Union(_standardInstanceCategories, strset.New("p", "g", "inf", "x", "f"))

// Quota codes used as keys of quotaCodeToValueMap in VerifyNetworkQuotas, plus
// the fixed counts used by the security group sizing helpers below.
// NOTE(review): the first four quota codes appear twice in this view (old and
// new side of the diff); presumably only one set survives in the real file.
const (
_elasticIPsQuotaCode = "L-0263D0A3"
_internetGatewayQuotaCode = "L-A4707A72"
_natGatewayQuotaCode = "L-FE5A380F"
_vpcQuotaCode = "L-F678F1CE"
_elasticIPsQuotaCode = "L-0263D0A3"
_internetGatewayQuotaCode = "L-A4707A72"
_natGatewayQuotaCode = "L-FE5A380F"
_vpcQuotaCode = "L-F678F1CE"
// quota looked up for the total number of security groups
_securityGroupsQuotaCode = "L-E79EC296"
// quota looked up for the number of rules in a single security group
_securityGroupRulesQuotaCode = "L-0EA8095F"

// fixed number of inbound rules counted for a node group security group,
// independent of the number of availability zones and whitelist length
_baseInboundRulesForNodeGroup = 11
// inbound rules added per availability zone by requiredRulesForNodeGroupSecurityGroup
_inboundRulesPerAZ = 8
// ClusterSharedNodeSecurityGroup, ControlPlaneSecurityGroup, eks-cluster-sg-<cluster-name>, and operator security group
_baseNumberOfSecurityGroups = 4
)

type InstanceTypeRequests struct {
Expand Down Expand Up @@ -145,12 +153,21 @@ func (c *Client) VerifyInstanceQuota(instances []InstanceTypeRequests) error {
return nil
}

func (c *Client) VerifyNetworkQuotas(requiredInternetGateways int, natGatewayRequired bool, highlyAvailableNATGateway bool, requiredVPCs int, availabilityZones strset.Set) error {
func (c *Client) VerifyNetworkQuotas(
requiredInternetGateways int,
natGatewayRequired bool,
highlyAvailableNATGateway bool,
requiredVPCs int,
availabilityZones strset.Set,
numNodeGroups int,
longestCIDRWhiteList int) error {
quotaCodeToValueMap := map[string]int{
_elasticIPsQuotaCode: 0, // elastic IP quota code
_internetGatewayQuotaCode: 0, // internet gw quota code
_natGatewayQuotaCode: 0, // nat gw quota code
_vpcQuotaCode: 0, // vpc quota code
_elasticIPsQuotaCode: 0, // elastic IP quota code
_internetGatewayQuotaCode: 0, // internet gw quota code
_natGatewayQuotaCode: 0, // nat gw quota code
_vpcQuotaCode: 0, // vpc quota code
_securityGroupsQuotaCode: 0, // security groups quota code
_securityGroupRulesQuotaCode: 0, // security group rules quota code
}

err := c.ServiceQuotas().ListServiceQuotasPages(
Expand Down Expand Up @@ -285,5 +302,52 @@ func (c *Client) VerifyNetworkQuotas(requiredInternetGateways int, natGatewayReq
}
}

// check rules quota for nodegroup SGs
requiredRulesForSG := requiredRulesForNodeGroupSecurityGroup(len(availabilityZones), longestCIDRWhiteList)
if requiredRulesForSG > quotaCodeToValueMap[_securityGroupRulesQuotaCode] {
additionalQuotaRequired := requiredRulesForSG - quotaCodeToValueMap[_securityGroupRulesQuotaCode]
return ErrorSecurityGroupRulesExceeded(quotaCodeToValueMap[_securityGroupRulesQuotaCode], additionalQuotaRequired, c.Region)
}

// check rules quota for control plane SG
requiredRulesForCPSG := requiredRulesForControlPlaneSecurityGroup(numNodeGroups)
if requiredRulesForCPSG > quotaCodeToValueMap[_securityGroupRulesQuotaCode] {
additionalQuotaRequired := requiredRulesForCPSG - quotaCodeToValueMap[_securityGroupRulesQuotaCode]
return ErrorSecurityGroupRulesExceeded(quotaCodeToValueMap[_securityGroupRulesQuotaCode], additionalQuotaRequired, c.Region)
}

// check security groups quota
requiredSecurityGroups := requiredSecurityGroups(numNodeGroups)
sgs, err := c.DescribeSecurityGroups()
if err != nil {
return err
}
if quotaCodeToValueMap[_securityGroupsQuotaCode]-len(sgs)-requiredSecurityGroups < 0 {
additionalQuotaRequired := len(sgs) + requiredSecurityGroups - quotaCodeToValueMap[_securityGroupsQuotaCode]
return ErrorSecurityGroupLimitExceeded(quotaCodeToValueMap[_securityGroupsQuotaCode], additionalQuotaRequired, c.Region)

}

return nil
}

// requiredRulesForNodeGroupSecurityGroup returns the number of inbound rules
// counted for a node group security group: a fixed base, a fixed amount per
// availability zone, and a whitelist-dependent amount.
//
// numAZs is the number of availability zones and whitelistLength is the number
// of entries in the longest CIDR whitelist. The first whitelist entry counts as
// 1 rule and every additional entry counts as 5; a whitelistLength below 1
// contributes nothing.
func requiredRulesForNodeGroupSecurityGroup(numAZs, whitelistLength int) int {
	total := _baseInboundRulesForNodeGroup + numAZs*_inboundRulesPerAZ
	if whitelistLength >= 1 {
		total += 1 + 5*(whitelistLength-1)
	}
	return total
}

// requiredRulesForControlPlaneSecurityGroup returns the number of outbound
// rules counted for the control plane security group, given the number of node
// groups in the cluster config.
func requiredRulesForControlPlaneSecurityGroup(numNodeGroups int) int {
	// the operator node group is counted in addition to the configured ones
	totalNodeGroups := numNodeGroups + 1

	// two outbound rules per node group; there are half as many inbound rules,
	// so the outbound count is the limiting factor
	outboundRules := 2 * totalNodeGroups
	return outboundRules
}

// requiredSecurityGroups returns the number of security groups counted for a
// cluster with numNodeGroups node groups: one per node group on top of the
// fixed _baseNumberOfSecurityGroups.
func requiredSecurityGroups(numNodeGroups int) int {
	perNodeGroup := numNodeGroups // one security group for each node group
	return perNodeGroup + _baseNumberOfSecurityGroups
}
31 changes: 14 additions & 17 deletions pkg/types/clusterconfig/cluster_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -781,7 +781,7 @@ func (cc *CoreConfig) SQSNamePrefix() string {
}

// this validates the user-provided cluster config
func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) error {
func (cc *Config) Validate(awsClient *aws.Client) error {
fmt.Print("verifying your configuration ...\n\n")

numNodeGroups := len(cc.NodeGroups)
Expand Down Expand Up @@ -817,12 +817,10 @@ func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) er
})
}

if !skipQuotaVerification {
if err := awsClient.VerifyInstanceQuota(instances); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if !aws.IsAWSError(err) {
return errors.Wrap(err, NodeGroupsKey)
}
if err := awsClient.VerifyInstanceQuota(instances); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if !aws.IsAWSError(err) {
return errors.Wrap(err, NodeGroupsKey)
}
}

Expand Down Expand Up @@ -909,16 +907,15 @@ func (cc *Config) Validate(awsClient *aws.Client, skipQuotaVerification bool) er
}
}

if !skipQuotaVerification {
var requiredVPCs int
if len(cc.Subnets) == 0 {
requiredVPCs = 1
}
if err := awsClient.VerifyNetworkQuotas(1, cc.NATGateway != NoneNATGateway, cc.NATGateway == HighlyAvailableNATGateway, requiredVPCs, strset.FromSlice(cc.AvailabilityZones)); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if !aws.IsAWSError(err) {
return err
}
var requiredVPCs int
if len(cc.Subnets) == 0 {
requiredVPCs = 1
}
longestCIDRWhiteList := libmath.MaxInt(len(cc.APILoadBalancerCIDRWhiteList), len(cc.OperatorLoadBalancerCIDRWhiteList))
if err := awsClient.VerifyNetworkQuotas(1, cc.NATGateway != NoneNATGateway, cc.NATGateway == HighlyAvailableNATGateway, requiredVPCs, strset.FromSlice(cc.AvailabilityZones), len(cc.NodeGroups), longestCIDRWhiteList); err != nil {
// Skip AWS errors, since some regions (e.g. eu-north-1) do not support this API
if !aws.IsAWSError(err) {
return err
}
}

Expand Down