Skip to content

Commit

Permalink
update: create nodegroup adds taint for GPU and Neuron nodes (#117)
Browse files Browse the repository at this point in the history
  • Loading branch information
aaroniscode authored Oct 8, 2023
1 parent e672ac6 commit e40e012
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 14 deletions.
6 changes: 6 additions & 0 deletions pkg/resource/nodegroup/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,10 @@ managedNodeGroups:
{{- end }}
privateNetworking: true
spot: {{ .Spot }}
{{- range .Taints }}
taints:
- key: {{ .Key }}
value: {{ .Value | printf "%q" }}
effect: {{ .Effect }}
{{- end }}
`
64 changes: 50 additions & 14 deletions pkg/resource/nodegroup/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,24 @@ type NodegroupOptions struct {
MinSize int
MaxSize int
NodegroupName string
NoTaints bool
OperatingSystem string
Spot bool
SpotvCPUs int
SpotMemory int
Taints []Taint

UpdateDesired int
UpdateMin int
UpdateMax int
}

// Taint describes a single node taint applied to a nodegroup. Each entry in
// NodegroupOptions.Taints is rendered by the nodegroup config template as a
// key/value/effect item under "taints:".
type Taint struct {
	// Key is the taint key (e.g. "nvidia.com/gpu"); Value is the taint value,
	// which the template always emits as a quoted string, so the zero value
	// renders as ""; Effect is the taint effect (e.g. "NoSchedule").
	Key, Value, Effect string
}

func NewOptions() (options *NodegroupOptions, createFlags, updateFlags cmd.Flags) {
options = &NodegroupOptions{
CommonOptions: &resource.CommonOptions{},
Expand Down Expand Up @@ -102,6 +110,13 @@ func NewOptions() (options *NodegroupOptions, createFlags, updateFlags cmd.Flags
},
Option: &options.DesiredCapacity,
},
&cmd.BoolFlag{
CommandFlag: cmd.CommandFlag{
Name: "no-taints",
Description: "don't taint nodes with GPUs or Neuron cores",
},
Option: &options.NoTaints,
},
&cmd.StringFlag{
CommandFlag: cmd.CommandFlag{
Name: "os",
Expand Down Expand Up @@ -156,9 +171,9 @@ func NewOptions() (options *NodegroupOptions, createFlags, updateFlags cmd.Flags
}

func (o *NodegroupOptions) PreCreate() error {
filter := []types.Filter{aws.NewEC2InstanceTypeFilter(o.InstanceType)}

instanceTypes, err := aws.NewEC2Client().DescribeInstanceTypes(filter)
instanceTypes, err := aws.NewEC2Client().DescribeInstanceTypes(
[]types.Filter{aws.NewEC2InstanceTypeFilter(o.InstanceType)},
)
if err != nil {
return fmt.Errorf("failed to describe instance types: %w", err)
}
Expand All @@ -167,35 +182,56 @@ func (o *NodegroupOptions) PreCreate() error {
return fmt.Errorf("%q is not a valid instance type in region %q", o.InstanceType, o.Region)
}

var isGraviton, isNeuron, isNvidia bool
instType := strings.Split(o.InstanceType, ".")[0]

switch {
case strings.HasPrefix(instType, "g"),
strings.HasPrefix(instType, "p"):

isNvidia = true

case strings.HasPrefix(instType, "inf"),
strings.HasPrefix(instType, "trn"):

isNeuron = true

case strings.HasSuffix(instType, "g"),
strings.HasSuffix(instType, "gd"),
strings.HasSuffix(instType, "gn"),
strings.HasSuffix(instType, "gen"):

isGraviton = true
}

if isNeuron && !o.NoTaints {
o.Taints = append(o.Taints, Taint{Key: "aws.amazon.com/neuron", Effect: "NoSchedule"})
}

if isNvidia && !o.NoTaints {
o.Taints = append(o.Taints, Taint{Key: "nvidia.com/gpu", Effect: "NoSchedule"})
}

// AMI Lookup is currently only for Amazon Linux 2 EKS Optimized AMI
if o.OperatingSystem != "AmazonLinux2" {
return nil
}

instType := strings.Split(o.InstanceType, ".")[0]
ssmClient := aws.NewSSMClient()

switch {
case instType == "g5g":
return fmt.Errorf("%q instance type is not supported with the EKS optimized Amazon Linux AMI", "G5g")

case strings.HasPrefix(instType, "g"),
strings.HasPrefix(instType, "p"),
strings.HasPrefix(instType, "inf"),
strings.HasPrefix(instType, "trn"):

case isNeuron, isNvidia:
param, err := ssmClient.GetParameter(fmt.Sprintf(eksOptmizedGpuAmiPath, o.KubernetesVersion))
if err != nil {
return fmt.Errorf("failed to lookup EKS optimized accelerated AMI for instance type %s: %w", o.InstanceType, err)
}

o.AMI = awssdk.ToString(param.Value)

case strings.HasSuffix(instType, "g"),
strings.HasSuffix(instType, "gd"),
strings.HasSuffix(instType, "gn"),
strings.HasSuffix(instType, "gen"):

case isGraviton:
param, err := ssmClient.GetParameter(fmt.Sprintf(eksOptmizedArmAmiPath, o.KubernetesVersion))
if err != nil {
return fmt.Errorf("failed to lookup EKS optimized ARM AMI for instance type %s: %w", o.InstanceType, err)
Expand Down

0 comments on commit e40e012

Please sign in to comment.