@@ -2,22 +2,23 @@ package addons
2
2
3
3
import (
4
4
"context"
5
+ // For go:embed
6
+ _ "embed"
5
7
"fmt"
6
8
"time"
7
9
8
10
"github.com/kris-nova/logger"
9
11
"github.com/pkg/errors"
12
+
10
13
api "github.com/weaveworks/eksctl/pkg/apis/eksctl.io/v1alpha5"
11
14
"github.com/weaveworks/eksctl/pkg/kubernetes"
15
+ "github.com/weaveworks/eksctl/pkg/utils/instance"
12
16
13
17
appsv1 "k8s.io/api/apps/v1"
14
- v1 "k8s.io/api/core/v1"
18
+ corev1 "k8s.io/api/core/v1"
15
19
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
16
20
"k8s.io/apimachinery/pkg/watch"
17
21
clientappsv1 "k8s.io/client-go/kubernetes/typed/apps/v1"
18
-
19
- // For go:embed
20
- _ "embed"
21
22
)
22
23
23
24
//go:embed assets/efa-device-plugin.yaml
@@ -29,7 +30,7 @@ var neuronDevicePluginYaml []byte
29
30
//go:embed assets/nvidia-device-plugin.yaml
30
31
var nvidiaDevicePluginYaml []byte
31
32
32
- func useRegionalImage (spec * v1 .PodTemplateSpec , region string , account string ) error {
33
+ func useRegionalImage (spec * corev1 .PodTemplateSpec , region string , account string ) error {
33
34
imageFormat := spec .Spec .Containers [0 ].Image
34
35
dnsSuffix , err := awsDNSSuffixForRegion (region )
35
36
if err != nil {
@@ -72,13 +73,14 @@ func watchDaemonSetReady(dsClientSet clientappsv1.DaemonSetInterface, dsName str
72
73
}
73
74
}
74
75
75
// MkDevicePlugin is the factory signature for building a DevicePlugin. The
// New*DevicePlugin constructors in this file have this shape. The added (+)
// form extends the signature with the cluster config, passed as spec.
- type MkDevicePlugin func (rawClient kubernetes.RawClientInterface , region string , planMode bool ) DevicePlugin
76
+ type MkDevicePlugin func (rawClient kubernetes.RawClientInterface , region string , planMode bool , spec * api. ClusterConfig ) DevicePlugin
76
77
77
78
// DevicePlugin is the contract applyDevicePlugin relies on to install a device
// plugin DaemonSet: the plugin supplies its manifest and gets the chance to
// adjust the DaemonSet's pod template before it is created or replaced.
type DevicePlugin interface {
78
79
// RawClient returns the raw Kubernetes client associated with the plugin.
RawClient () kubernetes.RawClientInterface
79
80
// PlanMode reports the plan-mode flag; applyDevicePlugin forwards it to CreateOrReplace.
PlanMode () bool
80
81
// Manifest returns the plugin's manifest bytes (the implementations in this
// file return YAML embedded via go:embed).
Manifest () []byte
81
// SetImage lets the plugin adjust the image in the DaemonSet's pod template.
- SetImage (t * v1.PodTemplateSpec ) error
82
+ SetImage (t * corev1.PodTemplateSpec ) error
83
// SetTolerations lets the plugin add tolerations to the DaemonSet's pod
// template; applyDevicePlugin calls it right after SetImage.
+ SetTolerations (t * corev1.PodTemplateSpec ) error
82
84
// Deploy installs the plugin (the NVIDIA implementation calls applyDevicePlugin).
Deploy () error
83
85
}
84
86
@@ -103,7 +105,9 @@ func applyDevicePlugin(dp DevicePlugin) error {
103
105
if err := dp .SetImage (& daemonSet .Spec .Template ); err != nil {
104
106
return errors .Wrap (err , "setting image of device plugin daemonset" )
105
107
}
106
-
108
+ if err := dp .SetTolerations (& daemonSet .Spec .Template ); err != nil {
109
+ return errors .Wrap (err , "adding tolerations to device plugin daemonset" )
110
+ }
107
111
msg , err := rawResource .CreateOrReplace (dp .PlanMode ())
108
112
if err != nil {
109
113
return errors .Wrap (err , "calling create or replace on raw device plugin daemonset" )
@@ -124,11 +128,12 @@ func applyDevicePlugin(dp DevicePlugin) error {
124
128
}
125
129
126
130
// NewNeuronDevicePlugin creates a new NeuronDevicePlugin holding the given raw
// client, region, plan-mode flag and cluster config, returned as a DevicePlugin.
// The struct literal uses keyed fields so that adding the spec field does not
// depend on field order.
127
- func NewNeuronDevicePlugin (rawClient kubernetes.RawClientInterface , region string , planMode bool ) DevicePlugin {
131
+ func NewNeuronDevicePlugin (rawClient kubernetes.RawClientInterface , region string , planMode bool , spec * api. ClusterConfig ) DevicePlugin {
128
132
return & NeuronDevicePlugin {
129
- rawClient ,
130
- region ,
131
- planMode ,
133
+ rawClient : rawClient ,
134
+ region : region ,
135
+ planMode : planMode ,
136
+ spec : spec ,
132
137
}
133
138
}
134
139
@@ -137,6 +142,7 @@ type NeuronDevicePlugin struct {
137
142
rawClient kubernetes.RawClientInterface
138
143
region string
139
144
planMode bool
145
+ spec * api.ClusterConfig
140
146
}
141
147
142
148
func (n * NeuronDevicePlugin ) RawClient () kubernetes.RawClientInterface {
@@ -151,7 +157,11 @@ func (n *NeuronDevicePlugin) Manifest() []byte {
151
157
return neuronDevicePluginYaml
152
158
}
153
159
154
// SetImage is a no-op for the Neuron device plugin: the pod template is left
// untouched and nil is always returned.
- func (n * NeuronDevicePlugin ) SetImage (t * v1.PodTemplateSpec ) error {
160
+ func (n * NeuronDevicePlugin ) SetImage (t * corev1.PodTemplateSpec ) error {
161
+ return nil
162
+ }
163
+
164
// SetTolerations is a no-op for the Neuron device plugin: no tolerations are
// added to the pod template and nil is always returned.
+ func (n * NeuronDevicePlugin ) SetTolerations (t * corev1.PodTemplateSpec ) error {
155
165
return nil
156
166
}
157
167
@@ -161,11 +171,12 @@ func (n *NeuronDevicePlugin) Deploy() error {
161
171
}
162
172
163
173
// NewNvidiaDevicePlugin creates a new NvidiaDevicePlugin holding the given raw
// client, region, plan-mode flag and cluster config, returned as a DevicePlugin.
// The cluster config is what NvidiaDevicePlugin.SetTolerations reads node
// groups and their taints from.
164
- func NewNvidiaDevicePlugin (rawClient kubernetes.RawClientInterface , region string , planMode bool ) DevicePlugin {
174
+ func NewNvidiaDevicePlugin (rawClient kubernetes.RawClientInterface , region string , planMode bool , spec * api. ClusterConfig ) DevicePlugin {
165
175
return & NvidiaDevicePlugin {
166
- rawClient ,
167
- region ,
168
- planMode ,
176
+ rawClient : rawClient ,
177
+ region : region ,
178
+ planMode : planMode ,
179
+ spec : spec ,
169
180
}
170
181
}
171
182
@@ -174,6 +185,7 @@ type NvidiaDevicePlugin struct {
174
185
rawClient kubernetes.RawClientInterface
175
186
region string
176
187
planMode bool
188
+ spec * api.ClusterConfig
177
189
}
178
190
179
191
func (n * NvidiaDevicePlugin ) RawClient () kubernetes.RawClientInterface {
@@ -184,7 +196,7 @@ func (n *NvidiaDevicePlugin) PlanMode() bool {
184
196
return n .planMode
185
197
}
186
198
187
// SetImage is a no-op for the NVIDIA device plugin: the pod template is left
// untouched and nil is always returned.
- func (n * NvidiaDevicePlugin ) SetImage (t * v1 .PodTemplateSpec ) error {
199
+ func (n * NvidiaDevicePlugin ) SetImage (t * corev1 .PodTemplateSpec ) error {
188
200
return nil
189
201
}
190
202
@@ -197,11 +209,59 @@ func (n *NvidiaDevicePlugin) Deploy() error {
197
209
return applyDevicePlugin (n )
198
210
}
199
211
212
+ // SetTolerations sets given tolerations on the DaemonSet if they don't already exist.
213
+ // We check the taints on each node which is an NVIDIA instance type and apply
214
+ // tolerations for all the taints defined on the node.
215
+ func (n * NvidiaDevicePlugin ) SetTolerations (spec * corev1.PodTemplateSpec ) error {
216
+ contains := func (list []corev1.Toleration , key string ) bool {
217
+ for _ , t := range list {
218
+ if t .Key == key {
219
+ return true
220
+ }
221
+ }
222
+ return false
223
+ }
224
+ // don't duplicate taints from other nodes or overwrite them with
225
+ // different values ( shouldn't happen in general... )
226
+ taints := make (map [string ]api.NodeGroupTaint )
227
+ for _ , ng := range n .spec .NodeGroups {
228
+ if api .HasInstanceType (ng , instance .IsNvidiaInstanceType ) &&
229
+ ng .GetAMIFamily () == api .NodeImageFamilyAmazonLinux2 {
230
+ for _ , taint := range ng .Taints {
231
+ if _ , ok := taints [taint .Key ]; ! ok {
232
+ taints [taint .Key ] = taint
233
+ }
234
+ }
235
+ }
236
+ }
237
+ for _ , ng := range n .spec .ManagedNodeGroups {
238
+ if api .HasInstanceTypeManaged (ng , instance .IsNvidiaInstanceType ) &&
239
+ ng .GetAMIFamily () == api .NodeImageFamilyAmazonLinux2 {
240
+ for _ , taint := range ng .Taints {
241
+ if _ , ok := taints [taint .Key ]; ! ok {
242
+ taints [taint .Key ] = taint
243
+ }
244
+ }
245
+ }
246
+ }
247
+ for _ , t := range taints {
248
+ // only add toleration if it doesn't already exist. In that case, we don't overwrite it.
249
+ if ! contains (spec .Spec .Tolerations , t .Key ) {
250
+ spec .Spec .Tolerations = append (spec .Spec .Tolerations , corev1.Toleration {
251
+ Key : t .Key ,
252
+ Value : t .Value ,
253
+ })
254
+ }
255
+ }
256
+ return nil
257
+ }
258
+
200
259
// An EFADevicePlugin deploys the EFA Device Plugin to a cluster.
200
260
type EFADevicePlugin struct {
201
261
// rawClient is the raw Kubernetes client supplied to the constructor.
rawClient kubernetes.RawClientInterface
202
262
// region is passed to api.EKSResourceAccountID and useRegionalImage by SetImage.
region string
203
263
// planMode is the plan-mode flag supplied to the constructor.
planMode bool
264
// spec is the cluster config supplied to the constructor; the EFA methods
// visible in this file do not read it.
+ spec * api.ClusterConfig
205
265
}
206
266
207
267
func (n * EFADevicePlugin ) RawClient () kubernetes.RawClientInterface {
@@ -216,17 +276,22 @@ func (n *EFADevicePlugin) Manifest() []byte {
216
276
return efaDevicePluginYaml
217
277
}
218
278
219
// SetImage points the pod template at a region-specific image: it looks up the
// EKS resource account ID for the plugin's region and hands the template,
// region and account to useRegionalImage, whose error (if any) is returned.
- func (n * EFADevicePlugin ) SetImage (t * v1 .PodTemplateSpec ) error {
279
+ func (n * EFADevicePlugin ) SetImage (t * corev1 .PodTemplateSpec ) error {
220
280
account := api .EKSResourceAccountID (n .region )
221
281
return useRegionalImage (t , n .region , account )
222
282
}
223
283
284
// SetTolerations is a no-op for the EFA device plugin: no tolerations are
// added to the pod template and nil is always returned.
+ func (n * EFADevicePlugin ) SetTolerations (spec * corev1.PodTemplateSpec ) error {
285
+ return nil
286
+ }
287
+
224
288
// NewEFADevicePlugin creates a new EFADevicePlugin holding the given raw
// client, region, plan-mode flag and cluster config, returned as a DevicePlugin.
// The struct literal uses keyed fields so that adding the spec field does not
// depend on field order.
225
- func NewEFADevicePlugin (rawClient kubernetes.RawClientInterface , region string , planMode bool ) DevicePlugin {
289
+ func NewEFADevicePlugin (rawClient kubernetes.RawClientInterface , region string , planMode bool , spec * api. ClusterConfig ) DevicePlugin {
226
290
return & EFADevicePlugin {
227
- rawClient ,
228
- region ,
229
- planMode ,
291
+ rawClient : rawClient ,
292
+ region : region ,
293
+ planMode : planMode ,
294
+ spec : spec ,
230
295
}
231
296
}
232
297
0 commit comments