Skip to content

Commit

Permalink
chore(eks): support Inf1 instances (aws#8379)
Browse files Browse the repository at this point in the history
chore(eks): support inf1 instances

Amazon EKS supports Inferentia instances today (see [Inferentia Support](https://docs.aws.amazon.com/eks/latest/userguide/inferentia-support.html) and the doc [update](https://twitter.com/aws_doc/status/1268567994394968066)). This PR allows creating Amazon EKS worker nodes with `Inf1` instances.

- Add `Inf1` instances support

----

*By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license*
  • Loading branch information
pahud authored Jun 28, 2020
1 parent 14e5063 commit 3a6353e
Show file tree
Hide file tree
Showing 7 changed files with 597 additions and 197 deletions.
2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
"@aws-cdk/aws-codepipeline-actions/case/**",
"@aws-cdk/aws-ecr-assets/minimatch",
"@aws-cdk/aws-ecr-assets/minimatch/**",
"@aws-cdk/aws-eks/yaml",
"@aws-cdk/aws-eks/yaml/**",
"@aws-cdk/aws-lambda-nodejs/parcel-bundler",
"@aws-cdk/aws-lambda-nodejs/parcel-bundler/**",
"@aws-cdk/cloud-assembly-schema/jsonschema",
Expand Down
66 changes: 66 additions & 0 deletions packages/@aws-cdk/aws-eks/lib/addons/neuron-device-plugin.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# source: https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/k8s-neuron-device-plugin.yml
# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
#
# DaemonSet that runs the Neuron device plugin container on Inf1 worker nodes.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: neuron-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: neuron-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: neuron-device-plugin-ds
    spec:
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
        - key: aws.amazon.com/neuron
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            # The two terms below are alternatives: a node qualifies when it
            # matches either one. Both instance-type label keys are listed so
            # that a node carrying only one of them is still selected. Keep
            # the two value lists identical.
            nodeSelectorTerms:
              - matchExpressions:
                  - key: "beta.kubernetes.io/instance-type"
                    operator: In
                    values:
                      - inf1.xlarge
                      - inf1.2xlarge
                      - inf1.6xlarge
                      - inf1.24xlarge
              - matchExpressions:
                  - key: "node.kubernetes.io/instance-type"
                    operator: In
                    values:
                      - inf1.xlarge
                      - inf1.2xlarge
                      - inf1.6xlarge
                      - inf1.24xlarge
      containers:
        - image: 790709498068.dkr.ecr.us-west-2.amazonaws.com/neuron-device-plugin:1.0.9043.0
          imagePullPolicy: Always
          name: k8s-neuron-device-plugin-ctr
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            # Directory shared with the host (see the hostPath volume below).
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
34 changes: 33 additions & 1 deletion packages/@aws-cdk/aws-eks/lib/cluster.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ import * as ec2 from '@aws-cdk/aws-ec2';
import * as iam from '@aws-cdk/aws-iam';
import * as ssm from '@aws-cdk/aws-ssm';
import { CfnOutput, Construct, IResource, Resource, Stack, Tag, Token } from '@aws-cdk/core';
import * as fs from 'fs';
import * as path from 'path';
import * as YAML from 'yaml';
import { AwsAuth } from './aws-auth';
import { clusterArnComponents, ClusterResource } from './cluster-resource';
import { CfnCluster, CfnClusterProps } from './eks.generated';
Expand Down Expand Up @@ -385,6 +388,8 @@ export class Cluster extends Resource implements ICluster {

private _spotInterruptHandler?: HelmChart;

private _neuronDevicePlugin?: KubernetesResource;

private readonly version: string | undefined;

/**
Expand Down Expand Up @@ -537,6 +542,10 @@ export class Cluster extends Resource implements ICluster {
machineImageType: options.machineImageType,
});

if (nodeTypeForInstanceType(options.instanceType) === NodeType.INFERENTIA) {
this.addNeuronDevicePlugin();
}

return asg;
}

Expand Down Expand Up @@ -834,6 +843,20 @@ export class Cluster extends Resource implements ICluster {
return this._spotInterruptHandler;
}

/**
 * Ensures the Neuron device plugin manifest has been added to this cluster.
 *
 * On the first call the manifest is read from `addons/neuron-device-plugin.yaml`
 * (resolved relative to this module's directory), parsed as YAML and added
 * through `addResource`. The resulting resource is remembered in
 * `_neuronDevicePlugin`, so later calls return it without adding it again.
 */
private addNeuronDevicePlugin() {
  // Already added: hand back the remembered resource.
  if (this._neuronDevicePlugin) {
    return this._neuronDevicePlugin;
  }

  const manifestPath = path.join(__dirname, 'addons/neuron-device-plugin.yaml');
  const manifest = YAML.parse(fs.readFileSync(manifestPath, 'utf8'));
  this._neuronDevicePlugin = this.addResource('NeuronDevicePlugin', manifest);

  return this._neuronDevicePlugin;
}

/**
* Opportunistically tag subnets with the required tags.
*
Expand Down Expand Up @@ -1112,6 +1135,7 @@ export class EksOptimizedImage implements ec2.IMachineImage {
this.amiParameterName = `/aws/service/eks/optimized-ami/${this.kubernetesVersion}/`
+ ( this.nodeType === NodeType.STANDARD ? 'amazon-linux-2/' : '' )
+ ( this.nodeType === NodeType.GPU ? 'amazon-linux-2-gpu/' : '' )
+ (this.nodeType === NodeType.INFERENTIA ? 'amazon-linux-2-gpu/' : '')
+ 'recommended/image_id';
}

Expand Down Expand Up @@ -1176,6 +1200,11 @@ export enum NodeType {
* GPU instances
*/
GPU = 'GPU',

/**
* Inferentia instances
*/
INFERENTIA = 'INFERENTIA',
}

/**
Expand Down Expand Up @@ -1222,7 +1251,10 @@ export enum MachineImageType {
}

const GPU_INSTANCETYPES = ['p2', 'p3', 'g4'];
const INFERENTIA_INSTANCETYPES = ['inf1'];

/**
 * Classifies an EC2 instance type as a GPU, Inferentia or standard node type.
 *
 * The decision is made on the leading characters of the instance type's
 * string form: the first two characters are looked up in
 * `GPU_INSTANCETYPES`, then the first four in `INFERENTIA_INSTANCETYPES`.
 * Anything that matches neither list is treated as a standard node.
 *
 * @param instanceType the EC2 instance type to classify
 * @returns the matching `NodeType`
 */
function nodeTypeForInstanceType(instanceType: ec2.InstanceType): NodeType {
  const instanceTypeName = instanceType.toString();

  if (GPU_INSTANCETYPES.includes(instanceTypeName.substring(0, 2))) {
    return NodeType.GPU;
  }

  // Must stay reachable: a leftover GPU-or-STANDARD return placed ahead of
  // this check would stop Inferentia instances from ever being detected.
  if (INFERENTIA_INSTANCETYPES.includes(instanceTypeName.substring(0, 4))) {
    return NodeType.INFERENTIA;
  }

  return NodeType.STANDARD;
}
7 changes: 6 additions & 1 deletion packages/@aws-cdk/aws-eks/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
"devDependencies": {
"@aws-cdk/assert": "0.0.0",
"@types/nodeunit": "^0.0.31",
"@types/yaml": "1.2.0",
"aws-sdk": "^2.706.0",
"cdk-build-tools": "0.0.0",
"cdk-integ-tools": "0.0.0",
Expand All @@ -80,8 +81,12 @@
"@aws-cdk/aws-ssm": "0.0.0",
"@aws-cdk/core": "0.0.0",
"@aws-cdk/custom-resources": "0.0.0",
"constructs": "^3.0.2"
"constructs": "^3.0.2",
"yaml": "1.10.0"
},
"bundledDependencies": [
"yaml"
],
"homepage": "https://github.com/aws/aws-cdk",
"peerDependencies": {
"@aws-cdk/aws-autoscaling": "0.0.0",
Expand Down
Loading

0 comments on commit 3a6353e

Please sign in to comment.