# ray_v1alpha1_rayservice_template.yaml
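# Multi-document template: {cluster_id}, {ray_image}, {locustfile} and {solution} are
# placeholders that the test harness substitutes before the manifests are applied
# (presumably via Python str.format, which is why literal braces appear as {{}} below).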
kind: ConfigMap
apiVersion: v1
metadata:
  name: locusttest-{cluster_id}
data:
  locustfile.py: |
{locustfile}
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: script-{cluster_id}
data:
  solution.py: |
{solution}
---
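# The next three resources (ConfigMap, Service, Deployment) stand up a single-node
# Redis instance. The RayService below points its GCS at this Redis via
# RAY_REDIS_ADDRESS, which is what backs the ray.io/ft-enabled annotation.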
kind: ConfigMap
apiVersion: v1
metadata:
  name: redis-config-{cluster_id}
  labels:
    app: redis
data:
  redis.conf: |-
    dir /data
    port 6379
    bind 0.0.0.0
    appendonly yes
    protected-mode no
    requirepass 5241590000000000
    pidfile /data/redis-6379.pid
---
apiVersion: v1
kind: Service
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  type: ClusterIP
  ports:
    - name: redis
      port: 6379
  selector:
    app: redis
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis-{cluster_id}
  labels:
    app: redis
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: redis:5.0.8
          command:
            - "sh"
            - "-c"
            - "redis-server /usr/local/etc/redis/redis.conf"
          ports:
            - containerPort: 6379
          volumeMounts:
            - name: config
              mountPath: /usr/local/etc/redis/redis.conf
              subPath: redis.conf
      volumes:
        - name: config
          configMap:
            name: redis-config-{cluster_id}
---
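# RayService: a Ray cluster plus a Ray Serve application managed as one resource.
# ray.io/ft-enabled: "true" turns on GCS fault tolerance, backed by the external
# Redis defined above; the 300-second thresholds control how long the Serve service
# and its deployments may report unhealthy before KubeRay intervenes.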
apiVersion: ray.io/v1alpha1
kind: RayService
metadata:
  name: service-{cluster_id}
  annotations:
    ray.io/ft-enabled: "true"
spec:
  serviceUnhealthySecondThreshold: 300
  deploymentUnhealthySecondThreshold: 300
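  # The Serve app: importPath refers to solution.serve_entrypoint in the solution.py
  # mounted from the script ConfigMap at /tmp/testing/solution.py; runtimeEnv puts
  # /tmp/testing/ on PYTHONPATH so the module is importable.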
  serveConfig:
    importPath: solution.serve_entrypoint
    runtimeEnv: |
      env_vars:
        PYTHONPATH: "/tmp/testing/"
    deployments:
      - name: a
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: b
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: c
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: d
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: e
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
      - name: DAGDriver
        numReplicas: 6
        rayActorOptions:
          numCpus: 1
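  # Capacity check: 6 deployments x 6 replicas x 1 CPU = 36 CPUs of Serve actors.
  # The worker group below advertises 12 pods x 4 logical CPUs = 48 CPUs, and the
  # head advertises 0 (num-cpus: '0'), so all replicas should land on worker pods.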
  rayClusterConfig:
    rayVersion: '3.0.0.dev0' # should match the Ray version in the image of the containers
    ######################headGroupSpecs#################################
    # head group template and specs (perhaps 'group' is not needed in the name)
    headGroupSpec:
      # Kubernetes Service type; valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
      serviceType: ClusterIP
      # the pod replicas in this group typed head (assuming there could be more than 1 in the future)
      replicas: 1
      # logical group name; for this group it is called head-group, but it can also be functional
      # pod type: head or worker
      # rayNodeType: head # not needed since it is under headGroupSpec
      # the following params are used to complete the ray start command: ray start --head --block --port=6379 ...
      rayStartParams:
        port: '6379' # should match the container port named gcs-server
        object-store-memory: '100000000'
        dashboard-host: '0.0.0.0'
        num-cpus: '0' # can be auto-completed from the limits
        node-ip-address: $MY_POD_IP # auto-completed as the head pod IP
        block: 'true'
      # pod template
      template:
        metadata:
          labels:
            rayCluster: cluster-{cluster_id}
            rayNodeType: head # will be injected if missing; must be head or worker
            groupName: headgroup # will be injected if missing
          # annotations for the pod
          annotations:
            key: value
        spec:
          volumes:
            - name: script
              configMap:
                name: script-{cluster_id}
            - name: log-volume
              emptyDir: {{}}
          containers:
            - name: ray-head
              image: {ray_image}
              imagePullPolicy: Always
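              # MY_POD_IP is consumed by rayStartParams.node-ip-address above;
              # RAY_REDIS_ADDRESS points the GCS at the external Redis Service for
              # fault tolerance, and the 600s reconnect timeout lets the cluster
              # wait for the GCS to come back after a head restart.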
              env:
                - name: MY_POD_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.podIP
                - name: RAY_REDIS_ADDRESS
                  value: redis-{cluster_id}:6379
                - name: RAY_gcs_rpc_server_reconnect_timeout_s
                  value: "600"
                - name: SERVE_DEPLOYMENT_HANDLE_IS_SYNC
                  value: "1"
              resources:
                limits:
                  cpu: 2
                requests:
                  cpu: 2
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265 # Ray dashboard
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
              volumeMounts:
                - name: script
                  mountPath: /tmp/testing/solution.py
                  subPath: solution.py
                - mountPath: /tmp/ray/
                  name: log-volume
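    # Fixed-size worker pool: replicas = minReplicas = maxReplicas = 12, so no
    # autoscaling. Each worker advertises 4 logical CPUs to Ray (num-cpus: '4')
    # while its container requests/limits only 2 CPUs, so logical CPUs are
    # oversubscribed relative to the Kubernetes resources.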
    workerGroupSpecs:
      # the pod replicas in this group typed worker
      - replicas: 12
        minReplicas: 12
        maxReplicas: 12
        # logical group name; for this group it is called small-group, but it can also be functional
        groupName: small-group
        # if worker pods need to be added, we can simply increment the replicas
        # if worker pods need to be removed, we decrement the replicas and populate the workersToDelete list
        # the operator will remove pods from the list until the number of replicas is satisfied
        # when a pod is confirmed to be deleted, its name will be removed from the list below
        #scaleStrategy:
        #  workersToDelete:
        #    - raycluster-complete-worker-small-group-bdtwh
        #    - raycluster-complete-worker-small-group-hv457
        #    - raycluster-complete-worker-small-group-k8tj7
        # the following params are used to complete the ray start command: ray start --block --node-ip-address= ...
        rayStartParams:
          node-ip-address: $MY_POD_IP
          block: 'true'
          num-cpus: '4' # can be auto-completed from the limits
        # pod template
        template:
          metadata:
            labels:
              key: value
              rayCluster: cluster-{cluster_id}
            # annotations for the pod
            annotations:
              key: value
          spec:
            initContainers:
              # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
              - name: init-myservice
                image: busybox:1.28
                command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
            volumes:
              - name: script
                configMap:
                  name: script-{cluster_id}
              - name: log-volume
                emptyDir: {{}}
            containers:
              - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
                image: {ray_image}
                imagePullPolicy: Always
                livenessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10
                readinessProbe:
                  initialDelaySeconds: 30
                  periodSeconds: 5
                  timeoutSeconds: 10
                # environment variables to set in the container. Optional.
                # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                env:
                  - name: RAY_DISABLE_DOCKER_CPU_WARNING
                    value: "1"
                  - name: TYPE
                    value: "worker"
                  - name: CPU_REQUEST
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.cpu
                  - name: CPU_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.cpu
                  - name: MEMORY_LIMITS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: limits.memory
                  - name: MEMORY_REQUESTS
                    valueFrom:
                      resourceFieldRef:
                        containerName: machine-learning
                        resource: requests.memory
                  - name: MY_POD_NAME
                    valueFrom:
                      fieldRef:
                        fieldPath: metadata.name
                  - name: MY_POD_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.podIP
                  - name: RAY_gcs_rpc_server_reconnect_timeout_s
                    value: "600"
                  - name: RAY_gcs_server_request_timeout_seconds
                    value: "5"
                  - name: SERVE_DEPLOYMENT_HANDLE_IS_SYNC
                    value: "1"
                ports:
                  - containerPort: 80
                    name: client
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh","-c","ray stop"]
                resources:
                  limits:
                    cpu: "2"
                  requests:
                    cpu: "2"
                volumeMounts:
                  - name: script
                    mountPath: /tmp/testing/solution.py
                    subPath: solution.py
                  - mountPath: /tmp/ray/
                    name: log-volume
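# Illustrative usage (an assumption, not defined in this file): render the placeholders
# with the test harness (e.g. Python str.format) and apply the result with
# `kubectl apply -f <rendered-file>`.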