# rllib.rayci.yml — rayci/Buildkite CI pipeline definition for RLlib tests.
# Pipeline group for all RLlib CI jobs. Runs only after the "forge" step
# (presumably the base tooling image build — confirm in the parent pipeline).
group: rllib tests
depends_on:
  - forge
steps:
  # builds
  # CPU test image: RLlib deps layered on top of the oss-ci-base_ml image
  # via the shared wanda build spec.
  - name: rllibbuild
    wanda: ci/docker/rllib.build.wanda.yaml
    depends_on: oss-ci-base_ml
    env:
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_ml
      IMAGE_TO: rllibbuild
      RAYCI_IS_GPU_BUILD: "false"
  # GPU test image: same wanda spec, layered on the GPU base image instead.
  - name: rllibgpubuild
    wanda: ci/docker/rllib.build.wanda.yaml
    depends_on: oss-ci-base_gpu
    env:
      IMAGE_FROM: cr.ray.io/rayproject/oss-ci-base_gpu
      IMAGE_TO: rllibgpubuild
      RAYCI_IS_GPU_BUILD: "true"
  # tests
  # Core CPU unit tests (algorithms, models, misc): everything EXCEPT the
  # categories that have their own dedicated steps below (learning tests,
  # examples, tests_dir, docs, GPU, manual, ...). Sharded across 4 Buildkite
  # workers; "$$" defers env-var expansion to the agent at runtime.
  - label: ":brain: rllib: algorithm, model and others"
    tags: rllib_directly
    parallelism: 4
    instance_type: large
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
        --except-tags learning_tests,memory_leak_tests,examples,tests_dir,documentation,multi_gpu,no_cpu,torch_2.x_only_benchmark,manual
        --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
    depends_on: rllibbuild
  # PyTorch learning tests, sharded 5 ways. The first command runs tests that
  # tolerate 3-way in-worker parallelism; the second runs the tests tagged as
  # needing all cores of the machine (no --parallelism-per-worker), reusing
  # the docker image already installed by the first command.
  - label: ":brain: rllib: learning tests pytorch"
    tags: rllib
    parallelism: 5
    instance_type: large
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
        --only-tags fake_gpus,learning_tests_discrete,crashing_cartpole,stateless_cartpole,learning_tests_continuous
        --except-tags tf_only,tf2_only,gpu,multi_gpu,learning_tests_pytorch_use_all_core
        --test-arg --framework=torch
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
        --only-tags learning_tests_pytorch_use_all_core
        --except-tags tf_only,tf2_only,gpu,multi_gpu
        --test-arg --framework=torch
        --skip-ray-installation
    depends_on: rllibbuild
  # Example scripts, sharded 6 ways. Split the same way as the learning-tests
  # step: normal examples run with 2-way in-worker parallelism; examples
  # tagged examples_use_all_core run alone per worker, reusing the already
  # installed docker image.
  - label: ":brain: rllib: examples"
    tags: rllib
    parallelism: 6
    instance_type: large
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 2
        --only-tags examples
        --except-tags multi_gpu,gpu,examples_use_all_core
        --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
        --only-tags examples_use_all_core
        --skip-ray-installation
        --except-tags multi_gpu,gpu
        --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
    depends_on: rllibbuild
  # Tests under rllib/tests/ (tag tests_dir), sharded 2 ways, excluding
  # multi-GPU and manually-triggered tests.
  - label: ":brain: rllib: tests dir"
    tags: rllib_directly
    parallelism: 2
    instance_type: large
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
        --only-tags tests_dir
        --except-tags multi_gpu,manual
        --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
    depends_on: rllibbuild
  # Single-GPU tests (tag "gpu"), sharded 5 ways on GPU instances, using the
  # GPU docker image. Skipped by microcheck runs.
  - label: ":brain: rllib: gpu tests"
    tags:
      - rllib_gpu
      - gpu
      - skip-on-microcheck
    parallelism: 5
    instance_type: gpu
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
        --build-name rllibgpubuild
        --only-tags gpu
        --test-env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
        --test-env=RLLIB_NUM_GPUS=1
    depends_on: rllibgpubuild
  # RLlib + Ray Data integration tests. Only runs on non-master branches
  # (i.e. premerge; postmerge coverage presumably lives elsewhere — confirm).
  - label: ":brain: rllib: data tests"
    if: build.branch != "master"
    tags:
      - data
      - rllib
    instance_type: large
    commands:
      # learning tests pytorch
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --parallelism-per-worker 3
        --only-tags learning_tests_with_ray_data
        --except-tags multi_gpu,gpu,tf_only,tf2_only
        --test-arg --framework=torch
      # rllib unittests
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --parallelism-per-worker 3
        --only-tags ray_data
        --except-tags learning_tests_with_ray_data,multi_gpu,gpu
        --skip-ray-installation # reuse the same docker image as the previous run
    depends_on: rllibbuild
  # Torch 2.x-only benchmark tests (single command, no sharding).
  - label: ":brain: rllib: benchmarks"
    tags: rllib
    instance_type: medium
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --only-tags torch_2.x_only_benchmark
    depends_on: rllibbuild
  # Disabled step: memory-leak tests are currently commented out.
  # - label: ":brain: rllib: memory leak pytorch tests"
  #   tags: rllib
  #   instance_type: medium
  #   commands:
  #     - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
  #       --only-tags memory_leak_tests
  #       --except-tags flaky
  #       --test-arg --framework=torch
  #   depends_on: rllibbuild
  # Documentation tests: doctests across python/ray and doc/, then the doc
  # example scripts, then RLlib tests tagged "documentation". The second and
  # third commands reuse the image installed by the first. Skipped by
  # microcheck runs.
  - label: ":brain: rllib: doc tests"
    tags:
      - rllib_directly
      - doc
      - skip-on-microcheck
    instance_type: medium
    commands:
      # doc tests
      - bazel run //ci/ray_ci:test_in_docker -- python/ray/... //doc/... rllib
        --except-tags gpu
        --only-tags doctest
        --parallelism-per-worker 2
      # doc examples
      - bazel run //ci/ray_ci:test_in_docker -- //doc/... rllib
        --except-tags gpu,post_wheel_build,timeseries_libs,doctest
        --parallelism-per-worker 2
        --skip-ray-installation
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --only-tags documentation
        --parallelism-per-worker 2
        --skip-ray-installation
    depends_on: rllibbuild
  # Multi-GPU tests (tag multi_gpu): 4 GPUs per run on gpu-large instances,
  # sharded 5 ways, using the GPU docker image. Skipped by microcheck runs.
  - label: ":brain: rllib: multi-gpu tests"
    tags:
      - rllib_gpu
      - gpu
      - skip-on-microcheck
    parallelism: 5
    instance_type: gpu-large
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
        --workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
        --parallelism-per-worker 2
        --gpus 4
        --build-name rllibgpubuild
        --only-tags multi_gpu
    depends_on: rllibgpubuild
  # Known-flaky multi-GPU tests (--run-flaky-tests). Postmerge only
  # (skip-on-premerge); soft_fail keeps failures from blocking the build.
  - label: ":brain: rllib: flaky multi-gpu tests"
    key: rllib_flaky_multi_gpu_tests
    tags:
      - rllib_gpu
      - gpu
      - skip-on-premerge
    instance_type: gpu-large
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
        --parallelism-per-worker 2
        --gpus 4
        --build-name rllibgpubuild
        --only-tags multi_gpu
    depends_on: rllibgpubuild
    soft_fail: true
  # Known-flaky single-GPU tests. Postmerge only; soft_fail so failures are
  # reported but never block the build.
  - label: ":brain: rllib: flaky gpu tests"
    key: rllib_flaky_gpu_tests
    tags:
      - rllib_gpu
      - gpu
      - skip-on-premerge
    instance_type: gpu
    commands:
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
        --build-name rllibgpubuild
        --only-tags gpu
        --test-env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
        --test-env=RLLIB_NUM_GPUS=1
    depends_on: rllibgpubuild
    soft_fail: true
  # Known-flaky learning tests across all three frameworks (torch, tf static
  # graph, tf2 eager tracing). Postmerge only; soft_fail. Later commands reuse
  # the docker image installed by the first.
  - label: ":brain: rllib: flaky tests (learning tests)"
    key: rllib_flaky_tests_01
    tags:
      - rllib
      - skip-on-premerge
    instance_type: large
    commands:
      # torch
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
        --only-tags fake_gpus,learning_tests_discrete,learning_tests_with_ray_data,crashing_cartpole,stateless_cartpole,learning_tests_continuous
        --except-tags tf_only,tf2_only,multi_gpu,gpu
        --test-arg --framework=torch
      # tf2-static-graph
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
        --only-tags tf_only
        --except-tags torch_only,tf2_only,no_tf_static_graph,multi_gpu,gpu
        --test-arg --framework=tf
        --skip-ray-installation # reuse the same docker image as the previous run
      # tf2-eager-tracing
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
        --only-tags tf2_only
        --except-tags fake_gpus,torch_only,multi_gpu,no_tf_eager_tracing,gpu
        --test-arg --framework=tf2
        --skip-ray-installation # reuse the same docker image as the previous run
    depends_on: rllibbuild
    soft_fail: true
  # Known-flaky tests for the remaining categories: examples, RLModule,
  # algorithms/models, and tests_dir. Postmerge only; soft_fail. Commands
  # after the first reuse the already-installed docker image.
  - label: ":brain: rllib: flaky tests (examples/rlmodule/models/tests_dir)"
    key: rllib_flaky_tests_02
    tags:
      - rllib
      - skip-on-premerge
    instance_type: large
    commands:
      # examples
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
        --only-tags examples
        --except-tags multi_gpu,gpu
        --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
      # rlmodule tests
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
        --only-tags rlm
        --except-tags multi_gpu,gpu
        --test-env RLLIB_ENABLE_RL_MODULE=1
        --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
        --skip-ray-installation # reuse the same docker image as the previous run
      # algorithm, models
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
        --except-tags learning_tests,memory_leak_tests,examples,tests_dir,documentation,multi_gpu,gpu,no_cpu,torch_2.x_only_benchmark,manual
        --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
        --skip-ray-installation # reuse the same docker image as the previous run
      # tests/ dir
      - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
        --only-tags tests_dir
        --except-tags multi_gpu,gpu,manual
        --test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
        --skip-ray-installation # reuse the same docker image as the previous run
    depends_on: rllibbuild
    soft_fail: true