|
| 1 | +import json |
1 | 2 | import os |
2 | 3 | import sys |
3 | 4 |
|
|
19 | 20 | import psutil |
20 | 21 |
|
21 | 22 |
|
def create_failure_json(method, num_failures, failure_str):
    """Build the JSON payload for a single RPC method's failure injection.

    The returned string is what the tests in this file pass as the value of
    the ``RAY_testing_rpc_failure`` environment variable.

    Args:
        method: RPC method name; used as the single top-level JSON key.
        num_failures: Value stored under ``"num_failures"``.
        failure_str: Exactly three colon-separated integers, stored in order
            under ``"req_failure_prob"``, ``"resp_failure_prob"`` and
            ``"in_flight_failure_prob"`` (e.g. ``"100:0:0"``).

    Returns:
        A JSON string of the form ``{method: {...four integer fields...}}``.

    Raises:
        ValueError: If ``failure_str`` does not contain exactly three fields,
            or if any field is not a valid integer.
    """
    parts = failure_str.split(":")
    # Fail loudly on a malformed spec: silently dropping extra fields (or
    # surfacing a bare IndexError for missing ones) hides test-setup mistakes.
    if len(parts) != 3:
        raise ValueError(
            "failure_str must have exactly 3 colon-separated integer fields "
            f"(req:resp:in_flight), got {failure_str!r}"
        )
    req_prob, resp_prob, in_flight_prob = (int(part) for part in parts)
    return json.dumps(
        {
            method: {
                "num_failures": num_failures,
                "req_failure_prob": req_prob,
                "resp_failure_prob": resp_prob,
                "in_flight_failure_prob": in_flight_prob,
            }
        }
    )
| 35 | + |
| 36 | + |
22 | 37 | @pytest.mark.parametrize("deterministic_failure", RPC_FAILURE_TYPES) |
23 | 38 | def test_request_worker_lease_idempotent( |
24 | 39 | monkeypatch, shutdown_only, deterministic_failure, ray_start_cluster |
25 | 40 | ): |
26 | 41 | failure = RPC_FAILURE_MAP[deterministic_failure] |
27 | 42 | monkeypatch.setenv( |
28 | 43 | "RAY_testing_rpc_failure", |
29 | | - f"NodeManagerService.grpc_client.RequestWorkerLease=1:{failure}", |
| 44 | + create_failure_json( |
| 45 | + "NodeManagerService.grpc_client.RequestWorkerLease", 1, failure |
| 46 | + ), |
30 | 47 | ) |
31 | 48 |
|
32 | 49 | @ray.remote |
@@ -61,7 +78,16 @@ def test_drain_node_idempotent(monkeypatch, shutdown_only, ray_start_cluster): |
61 | 78 | # NOTE: not testing response failure since the node is already marked as draining and shuts down gracefully. |
62 | 79 | monkeypatch.setenv( |
63 | 80 | "RAY_testing_rpc_failure", |
64 | | - "NodeManagerService.grpc_client.DrainRaylet=1:100:0:0", |
| 81 | + json.dumps( |
| 82 | + { |
| 83 | + "NodeManagerService.grpc_client.DrainRaylet": { |
| 84 | + "num_failures": 1, |
| 85 | + "req_failure_prob": 100, |
| 86 | + "resp_failure_prob": 0, |
| 87 | + "in_flight_failure_prob": 0, |
| 88 | + } |
| 89 | + } |
| 90 | + ), |
65 | 91 | ) |
66 | 92 |
|
67 | 93 | cluster = ray_start_cluster |
@@ -99,10 +125,25 @@ def node_is_dead(): |
99 | 125 | def inject_release_unused_bundles_rpc_failure(monkeypatch, request): |
100 | 126 | deterministic_failure = request.param |
101 | 127 | failure = RPC_FAILURE_MAP[deterministic_failure] |
| 128 | + parts = failure.split(":") |
102 | 129 | monkeypatch.setenv( |
103 | 130 | "RAY_testing_rpc_failure", |
104 | | - f"NodeManagerService.grpc_client.ReleaseUnusedBundles=1:{failure}" |
105 | | - + ",NodeManagerService.grpc_client.CancelResourceReserve=-1:100:0:0", |
| 131 | + json.dumps( |
| 132 | + { |
| 133 | + "NodeManagerService.grpc_client.ReleaseUnusedBundles": { |
| 134 | + "num_failures": 1, |
| 135 | + "req_failure_prob": int(parts[0]), |
| 136 | + "resp_failure_prob": int(parts[1]), |
| 137 | + "in_flight_failure_prob": int(parts[2]), |
| 138 | + }, |
| 139 | + "NodeManagerService.grpc_client.CancelResourceReserve": { |
| 140 | + "num_failures": -1, |
| 141 | + "req_failure_prob": 100, |
| 142 | + "resp_failure_prob": 0, |
| 143 | + "in_flight_failure_prob": 0, |
| 144 | + }, |
| 145 | + } |
| 146 | + ), |
106 | 147 | ) |
107 | 148 |
|
108 | 149 |
|
@@ -155,7 +196,9 @@ def inject_notify_gcs_restart_rpc_failure(monkeypatch, request): |
155 | 196 | failure = RPC_FAILURE_MAP[deterministic_failure] |
156 | 197 | monkeypatch.setenv( |
157 | 198 | "RAY_testing_rpc_failure", |
158 | | - f"NodeManagerService.grpc_client.NotifyGCSRestart=1:{failure}", |
| 199 | + create_failure_json( |
| 200 | + "NodeManagerService.grpc_client.NotifyGCSRestart", 1, failure |
| 201 | + ), |
159 | 202 | ) |
160 | 203 |
|
161 | 204 |
|
@@ -215,7 +258,16 @@ def test_kill_local_actor_rpc_retry_and_idempotency(monkeypatch, shutdown_only): |
215 | 258 |
|
216 | 259 | monkeypatch.setenv( |
217 | 260 | "RAY_testing_rpc_failure", |
218 | | - "NodeManagerService.grpc_client.KillLocalActor=1:100:0:0", |
| 261 | + json.dumps( |
| 262 | + { |
| 263 | + "NodeManagerService.grpc_client.KillLocalActor": { |
| 264 | + "num_failures": 1, |
| 265 | + "req_failure_prob": 100, |
| 266 | + "resp_failure_prob": 0, |
| 267 | + "in_flight_failure_prob": 0, |
| 268 | + } |
| 269 | + } |
| 270 | + ), |
219 | 271 | ) |
220 | 272 |
|
221 | 273 | ray.init() |
|
0 commit comments