forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmulti_node_test.py
299 lines (237 loc) · 9.28 KB
/
multi_node_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import ray
import subprocess
import sys
import tempfile
import time
from ray.test.test_utils import run_and_get_output
def run_string_as_driver(driver_script):
    """Run a driver as a separate process.

    The script is written to a temporary file and executed with the same
    interpreter that is running this module (``sys.executable``).

    Args:
        driver_script: A string to run as a Python script.

    Returns:
        The script's standard output, decoded as UTF-8 text.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            return code.
    """
    # Save the driver script as a file so we can call it using subprocess.
    with tempfile.NamedTemporaryFile() as f:
        # UTF-8 is a superset of ASCII, so pure-ASCII scripts produce the same
        # bytes as before while non-ASCII characters no longer raise
        # UnicodeEncodeError.
        f.write(driver_script.encode("utf-8"))
        # Flush so the child process sees the complete script on disk.
        f.flush()
        # The file must still exist while the child runs, so the call stays
        # inside the ``with`` block.
        raw_output = subprocess.check_output([sys.executable, f.name])
    return raw_output.decode("utf-8")
class MultiNodeTest(unittest.TestCase):
    """Tests that attach several drivers to one Ray cluster started by the CLI.

    Every test gets a fresh cluster: ``setUp`` runs ``ray start --head`` and
    parses the Redis address out of its output, and ``tearDown`` disconnects
    this driver and runs ``ray stop``.
    """

    def setUp(self):
        """Start a head node and store its Redis address on the instance."""
        out = run_and_get_output(["ray", "start", "--head"])

        # Get the redis address from the output.
        redis_substring_prefix = "redis_address=\""
        prefix_location = out.find(redis_substring_prefix)
        if prefix_location == -1:
            # str.find returns -1 when the prefix is missing; slicing from
            # that offset would silently produce a bogus address. unittest
            # does not call tearDown when setUp fails, so stop the cluster
            # here before reporting the failure.
            subprocess.Popen(["ray", "stop"]).wait()
            self.fail("Could not find a Redis address in the output of "
                      "'ray start --head':\n{}".format(out))
        redis_address_location = prefix_location + len(redis_substring_prefix)
        redis_address = out[redis_address_location:]
        # The address runs up to the closing double quote.
        self.redis_address = redis_address.split("\"")[0]

    def tearDown(self):
        """Disconnect this driver and stop the cluster."""
        try:
            ray.worker.cleanup()
        finally:
            # Kill the Ray cluster even if the cleanup above raised, so a
            # failure here cannot leak processes into later tests.
            subprocess.Popen(["ray", "stop"]).wait()

    def _wait_for_errors(self, num_errors, timeout=10):
        """Poll ``ray.error_info()`` until it holds ``num_errors`` entries.

        Args:
            num_errors: The exact number of errors to wait for.
            timeout: Maximum number of seconds to wait before failing the
                test instead of hanging forever.
        """
        deadline = time.time() + timeout
        while len(ray.error_info()) != num_errors:
            if time.time() > deadline:
                self.fail("Timed out after {} seconds waiting for {} "
                          "error(s); currently have {}.".format(
                              timeout, num_errors, len(ray.error_info())))
            time.sleep(0.1)
            print("Waiting for error to appear.")

    def testErrorIsolation(self):
        # Connect a driver to the Ray cluster.
        ray.init(redis_address=self.redis_address, driver_mode=ray.SILENT_MODE)

        # There shouldn't be any errors yet.
        self.assertEqual(len(ray.error_info()), 0)

        error_string1 = "error_string1"
        error_string2 = "error_string2"

        @ray.remote
        def f():
            raise Exception(error_string1)

        # Run a remote function that throws an error.
        with self.assertRaises(Exception):
            ray.get(f.remote())

        # Wait for the error to appear in Redis.
        self._wait_for_errors(1)

        # Make sure we got the error.
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_string1,
                      ray.error_info()[0][b"message"].decode("ascii"))

        # Start another driver and make sure that it does not receive this
        # error. Make the other driver throw an error, and make sure it
        # receives that error.
        driver_script = """
import ray
import time
ray.init(redis_address="{}")
time.sleep(1)
assert len(ray.error_info()) == 0
@ray.remote
def f():
    raise Exception("{}")
try:
    ray.get(f.remote())
except Exception as e:
    pass
while len(ray.error_info()) != 1:
    print(len(ray.error_info()))
    time.sleep(0.1)
assert len(ray.error_info()) == 1
assert "{}" in ray.error_info()[0][b"message"].decode("ascii")
print("success")
""".format(self.redis_address, error_string2, error_string2)

        out = run_string_as_driver(driver_script)
        # Make sure the other driver succeeded.
        self.assertIn("success", out)

        # Make sure that the other error message doesn't show up for this
        # driver.
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_string1,
                      ray.error_info()[0][b"message"].decode("ascii"))

    def testRemoteFunctionIsolation(self):
        # This test will run multiple remote functions with the same names in
        # two different drivers. Connect a driver to the Ray cluster.
        ray.init(redis_address=self.redis_address, driver_mode=ray.SILENT_MODE)

        # Start another driver and make sure that it can define and call its
        # own commands with the same names.
        driver_script = """
import ray
import time
ray.init(redis_address="{}")
@ray.remote
def f():
    return 3
@ray.remote
def g(x, y):
    return 4
for _ in range(10000):
    result = ray.get([f.remote(), g.remote(0, 0)])
    assert result == [3, 4]
print("success")
""".format(self.redis_address)

        out = run_string_as_driver(driver_script)

        # Same names as in the other driver, but different return values and
        # a different signature for g.
        @ray.remote
        def f():
            return 1

        @ray.remote
        def g(x):
            return 2

        for _ in range(10000):
            result = ray.get([f.remote(), g.remote(0)])
            self.assertEqual(result, [1, 2])

        # Make sure the other driver succeeded.
        self.assertIn("success", out)

    def testDriverExitingQuickly(self):
        # This test will create some drivers that submit some tasks and then
        # exit without waiting for the tasks to complete.
        ray.init(redis_address=self.redis_address, driver_mode=ray.SILENT_MODE)

        # Define a driver that creates an actor and exits.
        driver_script1 = """
import ray
ray.init(redis_address="{}")
@ray.remote
class Foo(object):
    def __init__(self):
        pass
Foo.remote()
print("success")
""".format(self.redis_address)

        # Define a driver that creates some tasks and exits.
        driver_script2 = """
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
    return 1
f.remote()
print("success")
""".format(self.redis_address)

        # Create some drivers and let them exit and make sure everything is
        # still alive.
        for _ in range(3):
            out = run_string_as_driver(driver_script1)
            # Make sure the first driver ran to completion.
            self.assertIn("success", out)
            out = run_string_as_driver(driver_script2)
            # Make sure the second driver ran to completion.
            self.assertIn("success", out)
            self.assertTrue(ray.services.all_processes_alive())
class StartRayScriptTest(unittest.TestCase):
    """Tests for the ``ray start`` / ``ray stop`` command line interface."""

    def _start_head_then_stop(self, extra_args):
        """Run ``ray start --head`` with ``extra_args``, then ``ray stop``.

        Args:
            extra_args: A list of additional command line arguments to append
                after ``ray start --head``.

        Any exception raised by the start command propagates to the caller,
        but ``ray stop`` is always run first so that a failed start cannot
        leave processes behind that hold the fixed ports used by later
        invocations.
        """
        try:
            run_and_get_output(["ray", "start", "--head"] + extra_args)
        finally:
            subprocess.Popen(["ray", "stop"]).wait()

    def testCallingStartRayHead(self):
        # Test that we can call start-ray.sh with various command line
        # parameters. TODO(rkn): This test only tests the --head code path. We
        # should also test the non-head node code path.

        # Test starting Ray with no arguments.
        self._start_head_then_stop([])

        # Test starting Ray with a number of workers specified.
        self._start_head_then_stop(["--num-workers", "20"])

        # Test starting Ray with a redis port specified.
        self._start_head_then_stop(["--redis-port", "6379"])

        # Test starting Ray with redis shard ports specified.
        self._start_head_then_stop(["--redis-shard-ports", "6380,6381,6382"])

        # Test starting Ray with a node IP address specified.
        self._start_head_then_stop(["--node-ip-address", "127.0.0.1"])

        # Test starting Ray with an object manager port specified.
        self._start_head_then_stop(["--object-manager-port", "12345"])

        # Test starting Ray with the number of CPUs specified.
        self._start_head_then_stop(["--num-cpus", "100"])

        # Test starting Ray with the number of GPUs specified.
        self._start_head_then_stop(["--num-gpus", "100"])

        # Test starting Ray with the max redis clients specified.
        self._start_head_then_stop(["--redis-max-clients", "100"])

        # Test starting Ray with all arguments specified.
        self._start_head_then_stop([
            "--num-workers", "20", "--redis-port", "6379",
            "--redis-shard-ports", "6380,6381,6382",
            "--object-manager-port", "12345", "--num-cpus", "100",
            "--num-gpus", "0", "--redis-max-clients", "100", "--resources",
            "{\"Custom\": 1}"
        ])

        # Test starting Ray with invalid arguments. The helper still runs
        # "ray stop" before the expected exception reaches assertRaises.
        with self.assertRaises(Exception):
            self._start_head_then_stop(["--redis-address", "127.0.0.1:6379"])

    def testUsingHostnames(self):
        try:
            # Start the Ray processes on this machine.
            run_and_get_output([
                "ray", "start", "--head", "--node-ip-address=localhost",
                "--redis-port=6379"
            ])

            ray.init(
                node_ip_address="localhost", redis_address="localhost:6379")

            @ray.remote
            def f():
                return 1

            self.assertEqual(ray.get(f.remote()), 1)
        finally:
            try:
                # Disconnect this driver, as the other test classes in this
                # file do in their tearDown.
                ray.worker.cleanup()
            finally:
                # Kill the Ray cluster, even if the test body or the cleanup
                # above failed.
                subprocess.Popen(["ray", "stop"]).wait()
class MiscellaneousTest(unittest.TestCase):
    """Tests that do not need a cluster started through the ``ray`` CLI."""

    def tearDown(self):
        """Disconnect the driver that the test connected with ``ray.init``."""
        ray.worker.cleanup()

    def testConnectingInLocalCase(self):
        # Call ray.init from this process (with zero CPUs) and keep the
        # address information it returns.
        local_address_info = ray.init(num_cpus=0)
        redis_address = local_address_info["redis_address"]

        # Define a driver that just connects to Redis.
        script_template = """
import ray
ray.init(redis_address="{}")
print("success")
"""
        second_driver = script_template.format(redis_address)

        # Make sure the other driver succeeded.
        driver_output = run_string_as_driver(second_driver)
        self.assertIn("success", driver_output)
# When this file is executed directly as a script, run every test case defined
# in this module with verbose output.
if __name__ == "__main__":
    unittest.main(verbosity=2)