Make wait_for_condition raise exception when timing out. #9710

Merged (5 commits) on Jul 27, 2020
4 changes: 2 additions & 2 deletions python/ray/serve/tests/test_api.py
@@ -579,7 +579,7 @@ def check_dead():
pass
return True

-assert wait_for_condition(check_dead)
+wait_for_condition(check_dead)


def test_shadow_traffic(serve_instance):
@@ -622,7 +622,7 @@ def check_requests():
requests_to_backend("backend4") > 0,
])

-assert wait_for_condition(check_requests)
+wait_for_condition(check_requests)


if __name__ == "__main__":
10 changes: 5 additions & 5 deletions python/ray/test_utils.py
@@ -239,22 +239,22 @@ def wait_for_errors(error_type, num_errors, timeout=20):


def wait_for_condition(condition_predictor, timeout=30, retry_interval_ms=100):
-    """A helper function that waits until a condition is met.
+    """Wait until a condition is met or time out with an exception.

    Args:
        condition_predictor: A function that predicts the condition.
        timeout: Maximum timeout in seconds.
        retry_interval_ms: Retry interval in milliseconds.

-    Return:
-        Whether the condition is met within the timeout.
+    Raises:
+        RuntimeError: If the condition is not met before the timeout expires.
    """
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
-            return True
+            return
        time.sleep(retry_interval_ms / 1000.0)
-    return False
+    raise RuntimeError("The condition wasn't met before the timeout expired.")

Collaborator (author) commented: This is the key change.


def wait_until_succeeded_without_exception(func,
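For reference, here is a minimal sketch (a hypothetical test, not code from this PR) of how call sites consume the new behavior: a met condition simply returns, so the old assert is unnecessary, and a timeout now surfaces as a RuntimeError that pytest reports directly or that a test can catch with pytest.raises.

import pytest
from ray.test_utils import wait_for_condition

def test_condition_met():
    # Returns as soon as the predicate is True; no assert on the return value.
    wait_for_condition(lambda: True, timeout=5, retry_interval_ms=50)

def test_condition_never_met():
    # A predicate that never becomes True now raises instead of returning False.
    with pytest.raises(RuntimeError):
        wait_for_condition(lambda: False, timeout=1, retry_interval_ms=100)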
8 changes: 4 additions & 4 deletions python/ray/tests/test_actor_failures.py
@@ -361,14 +361,14 @@ def actor_resource_available():

ray.experimental.set_resource("actor", 1)
actor = RestartableActor.remote()
-assert wait_for_condition(lambda: not actor_resource_available())
+wait_for_condition(lambda: not actor_resource_available())
# Kill the actor.
pid = ray.get(actor.get_pid.remote())

p = probe.remote()
os.kill(pid, SIGKILL)
ray.get(p)
-assert wait_for_condition(lambda: not actor_resource_available())
+wait_for_condition(lambda: not actor_resource_available())


def test_caller_actor_restart(ray_start_regular):
@@ -868,7 +868,7 @@ def actor_dead():
cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
# Repeatedly submit tasks and call ray.wait until the exception for the
# dead actor is received.
-assert wait_for_condition(actor_dead)
+wait_for_condition(actor_dead)

# Create an actor on the local node that will call ray.wait in a loop.
head_node_resource = "HEAD_NODE"
@@ -888,7 +888,7 @@ def ping(self):
# Repeatedly call ray.wait through the local actor until the exception for
# the dead actor is received.
parent_actor = ParentActor.remote()
-assert wait_for_condition(lambda: ray.get(parent_actor.wait.remote()))
+wait_for_condition(lambda: ray.get(parent_actor.wait.remote()))


if __name__ == "__main__":
8 changes: 4 additions & 4 deletions python/ray/tests/test_failure.py
@@ -1098,21 +1098,21 @@ def test_process_failure(use_actors):
pid = ray.get(a.get_pid.remote())
a.start_child.remote(use_actors=use_actors)
# Wait for the child to be scheduled.
-assert wait_for_condition(lambda: not child_resource_available())
+wait_for_condition(lambda: not child_resource_available())
# Kill the parent process.
os.kill(pid, 9)
-assert wait_for_condition(child_resource_available)
+wait_for_condition(child_resource_available)

# Test fate sharing if the parent node dies.
def test_node_failure(node_to_kill, use_actors):
a = Actor.options(resources={"parent": 1}).remote()
a.start_child.remote(use_actors=use_actors)
# Wait for the child to be scheduled.
-assert wait_for_condition(lambda: not child_resource_available())
+wait_for_condition(lambda: not child_resource_available())
# Kill the parent process.
cluster.remove_node(node_to_kill, allow_graceful=False)
node_to_kill = cluster.add_node(num_cpus=1, resources={"parent": 1})
-assert wait_for_condition(child_resource_available)
+wait_for_condition(child_resource_available)
return node_to_kill

if node_failure:
2 changes: 1 addition & 1 deletion python/ray/tests/test_gcs_fault_tolerance.py
@@ -109,7 +109,7 @@ def condition():
return False

# Wait for the removed node dead.
-assert wait_for_condition(condition, timeout=10)
+wait_for_condition(condition, timeout=10)


if __name__ == "__main__":
6 changes: 3 additions & 3 deletions python/ray/tests/test_global_gc.py
@@ -53,7 +53,7 @@ def check_refs_gced():
return (local_ref() is None and
not any(ray.get([a.has_garbage.remote() for a in actors])))

-assert wait_for_condition(check_refs_gced)
+wait_for_condition(check_refs_gced)
finally:
gc.enable()

@@ -105,7 +105,7 @@ def check_refs_gced():
return (local_ref() is None and
not any(ray.get([a.has_garbage.remote() for a in actors])))

-assert wait_for_condition(check_refs_gced)
+wait_for_condition(check_refs_gced)

# Local driver.
local_ref = weakref.ref(LargeObjectWithCyclicRef())
@@ -124,7 +124,7 @@ def check_refs_gced():
return (local_ref() is None and
not any(ray.get([a.has_garbage.remote() for a in actors])))

-assert wait_for_condition(check_refs_gced)
+wait_for_condition(check_refs_gced)
finally:
gc.enable()

2 changes: 1 addition & 1 deletion python/ray/tests/test_job.py
@@ -46,7 +46,7 @@ def actor_finish():
else:
return False

-assert wait_for_condition(actor_finish)
+wait_for_condition(actor_finish)


def test_job_gc_with_detached_actor(call_ray_start):
35 changes: 16 additions & 19 deletions python/ray/tests/test_metrics.py
@@ -443,7 +443,7 @@ def f(arg):
stop_memory_table()
return True

-def test_object_pineed_in_memory():
+def test_object_pinned_in_memory():

a = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
b = ray.get(a) # Noqa F841
@@ -469,8 +469,8 @@ def test_pending_task_references():
def f(arg):
time.sleep(1)

-a = ray.put(np.zeros(200 * 1024, dtype=np.uint8)) # Noqa F841
-b = f.remote(a) # Noqa F841
+a = ray.put(np.zeros(200 * 1024, dtype=np.uint8))
+b = f.remote(a)

wait_for_condition(memory_table_ready)
memory_table = get_memory_table()
@@ -491,7 +491,7 @@ def test_serialized_object_ref_reference():
def f(arg):
time.sleep(1)

-a = ray.put(None) # Noqa F841
+a = ray.put(None)
b = f.remote([a]) # Noqa F841

wait_for_condition(memory_table_ready)
@@ -550,30 +550,27 @@ class Actor:
# These tests should be retried because it takes at least one second
# to get the fresh new memory table. It is because memory table is updated
# Whenever raylet and node info is renewed which takes 1 second.
-assert (wait_for_condition(
-test_local_reference, timeout=30000, retry_interval_ms=1000) is True)
+wait_for_condition(
+test_local_reference, timeout=30000, retry_interval_ms=1000)

-assert (wait_for_condition(
-test_object_pineed_in_memory, timeout=30000, retry_interval_ms=1000) is
-True)
+wait_for_condition(
+test_object_pinned_in_memory, timeout=30000, retry_interval_ms=1000)

-assert (wait_for_condition(
-test_pending_task_references, timeout=30000, retry_interval_ms=1000) is
-True)
+wait_for_condition(
+test_pending_task_references, timeout=30000, retry_interval_ms=1000)

-assert (wait_for_condition(
+wait_for_condition(
test_serialized_object_ref_reference,
timeout=30000,
-retry_interval_ms=1000) is True)
+retry_interval_ms=1000)

-assert (wait_for_condition(
+wait_for_condition(
test_captured_object_ref_reference,
timeout=30000,
-retry_interval_ms=1000) is True)
+retry_interval_ms=1000)

-assert (wait_for_condition(
-test_actor_handle_reference, timeout=30000, retry_interval_ms=1000) is
-True)
+wait_for_condition(
+test_actor_handle_reference, timeout=30000, retry_interval_ms=1000)


"""Memory Table Unit Test"""
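The code comment in this file's diff notes that these calls pass retry_interval_ms=1000 because the memory table only refreshes about once per second, so polling faster than that gains nothing. As a rough illustration of the arithmetic (hypothetical values, not taken from this diff), the number of predicate evaluations before wait_for_condition raises is about the timeout divided by the retry interval in seconds:

# Rough upper bound on polls before the RuntimeError is raised,
# ignoring the time spent inside the predicate itself.
timeout_s = 30            # wait_for_condition's default timeout in seconds
retry_interval_ms = 1000  # poll once per second to match the table refresh
max_polls = int(timeout_s / (retry_interval_ms / 1000.0))
print(max_polls)  # about 30 attempts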
5 changes: 3 additions & 2 deletions python/ray/tests/test_reconstruction.py
@@ -46,7 +46,7 @@ def dependent_task(x):
cluster.remove_node(node_to_kill, allow_graceful=False)
cluster.add_node(
num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
-assert wait_for_condition(
+wait_for_condition(
lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

for _ in range(20):
@@ -101,7 +101,7 @@ def dependent_task(x):
cluster.remove_node(node_to_kill, allow_graceful=False)
cluster.add_node(
num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
-assert wait_for_condition(
+wait_for_condition(
lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

for _ in range(20):
@@ -296,6 +296,7 @@ def dependent_task(x):
pid = ray.get(a.pid.remote())


+@pytest.mark.skipif(sys.platform == "win32", reason="Test failing on Windows.")
Collaborator (author) commented: Skipping because I saw this fail with the following error (not sure if it's just Windows or everywhere, but I only saw it on Windows).

2020-07-25T22:46:38.3944180Z ================================== FAILURES ===================================
2020-07-25T22:46:38.3944438Z _____________ test_basic_reconstruction_actor_constructor[False] ______________
2020-07-25T22:46:38.3945545Z 
2020-07-25T22:46:38.3945858Z ray_start_cluster = <ray.cluster_utils.Cluster object at 0x000001D23CBE3688>
2020-07-25T22:46:38.3946140Z reconstruction_enabled = False
2020-07-25T22:46:38.3946349Z 
2020-07-25T22:46:38.3949025Z     @pytest.mark.parametrize("reconstruction_enabled", [False, True])
2020-07-25T22:46:38.3949534Z     def test_basic_reconstruction_actor_constructor(ray_start_cluster,
2020-07-25T22:46:38.3949846Z                                                     reconstruction_enabled):
2020-07-25T22:46:38.3950119Z         config = {
2020-07-25T22:46:38.3950381Z             "num_heartbeats_timeout": 10,
2020-07-25T22:46:38.3950710Z             "raylet_heartbeat_timeout_milliseconds": 100,
2020-07-25T22:46:38.3951098Z             "initial_reconstruction_timeout_milliseconds": 200,
2020-07-25T22:46:38.3951373Z         }
2020-07-25T22:46:38.3951622Z         # Workaround to reset the config to the default value.
2020-07-25T22:46:38.3951885Z         if not reconstruction_enabled:
2020-07-25T22:46:38.3952132Z             config["lineage_pinning_enabled"] = 0
2020-07-25T22:46:38.3952413Z         config = json.dumps(config)
2020-07-25T22:46:38.3952605Z     
2020-07-25T22:46:38.3952836Z         cluster = ray_start_cluster
2020-07-25T22:46:38.3953076Z         # Head node with no resources.
2020-07-25T22:46:38.3953315Z         cluster.add_node(
2020-07-25T22:46:38.3953549Z             num_cpus=0,
2020-07-25T22:46:38.3953786Z             _internal_config=config,
2020-07-25T22:46:38.3954065Z             enable_object_reconstruction=reconstruction_enabled)
2020-07-25T22:46:38.3954314Z         ray.init(address=cluster.address)
2020-07-25T22:46:38.3954594Z         # Node to place the initial object.
2020-07-25T22:46:38.3954833Z         node_to_kill = cluster.add_node(
2020-07-25T22:46:38.3955085Z             num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
2020-07-25T22:46:38.3955333Z         cluster.add_node(
2020-07-25T22:46:38.3955584Z             num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
2020-07-25T22:46:38.3955834Z         cluster.wait_for_nodes()
2020-07-25T22:46:38.3956064Z     
2020-07-25T22:46:38.3956301Z         @ray.remote(max_retries=1 if reconstruction_enabled else 0)
2020-07-25T22:46:38.3956546Z         def large_object():
2020-07-25T22:46:38.3956918Z             return np.zeros(10**7, dtype=np.uint8)
2020-07-25T22:46:38.3957152Z     
2020-07-25T22:46:38.3957397Z         # Both the constructor and a method depend on the large object.
2020-07-25T22:46:38.3957650Z         @ray.remote(max_restarts=-1)
2020-07-25T22:46:38.3957888Z         class Actor:
2020-07-25T22:46:38.3958124Z             def __init__(self, x):
2020-07-25T22:46:38.3958356Z                 pass
2020-07-25T22:46:38.3958584Z     
2020-07-25T22:46:38.3958818Z             def dependent_task(self, x):
2020-07-25T22:46:38.3959066Z                 return
2020-07-25T22:46:38.3959251Z     
2020-07-25T22:46:38.3959481Z             def pid(self):
2020-07-25T22:46:38.3959719Z                 return os.getpid()
2020-07-25T22:46:38.3959954Z     
2020-07-25T22:46:38.3960195Z         obj = large_object.options(resources={"node1": 1}).remote()
2020-07-25T22:46:38.3960449Z         a = Actor.options(resources={"node1": 1}).remote(obj)
2020-07-25T22:46:38.3960886Z         ray.get(a.dependent_task.remote(obj))
2020-07-25T22:46:38.3961152Z         pid = ray.get(a.pid.remote())
2020-07-25T22:46:38.3961385Z     
2020-07-25T22:46:38.3961630Z         # Workaround to kill the actor process too since there is a bug where the
2020-07-25T22:46:38.3961892Z         # actor's plasma client hangs after the plasma store has exited.
2020-07-25T22:46:38.3962141Z         os.kill(pid, SIGKILL)
2020-07-25T22:46:38.3962374Z     
2020-07-25T22:46:38.3962618Z         cluster.remove_node(node_to_kill, allow_graceful=False)
2020-07-25T22:46:38.3962877Z         cluster.add_node(
2020-07-25T22:46:38.3963214Z             num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
2020-07-25T22:46:38.3963467Z     
2020-07-25T22:46:38.3963697Z         wait_for_pid_to_exit(pid)
2020-07-25T22:46:38.3963929Z     
2020-07-25T22:46:38.3964164Z         # Wait for the actor to restart.
2020-07-25T22:46:38.3964398Z         def probe():
2020-07-25T22:46:38.3964632Z             try:
2020-07-25T22:46:38.3964883Z                 ray.get(a.dependent_task.remote(obj))
2020-07-25T22:46:38.3965128Z                 return True
2020-07-25T22:46:38.3965373Z             except ray.exceptions.RayActorError:
2020-07-25T22:46:38.3965804Z                 return False
2020-07-25T22:46:38.3966066Z             except (ray.exceptions.RayTaskError,
2020-07-25T22:46:38.3966337Z                     ray.exceptions.UnreconstructableError):
2020-07-25T22:46:38.3966600Z                 return True
2020-07-25T22:46:38.3968035Z     
2020-07-25T22:46:38.3968380Z >       wait_for_condition(probe)
2020-07-25T22:46:38.3968598Z 
2020-07-25T22:46:38.3968915Z \\?\C:\Users\RUNNER~1\AppData\Local\Temp\Bazel.runfiles_ymwnfkwp\runfiles\com_github_ray_project_ray\python\ray\tests\test_reconstruction.py:368: 
2020-07-25T22:46:38.3969222Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
2020-07-25T22:46:38.3971007Z 
2020-07-25T22:46:38.3971401Z condition_predictor = <function test_basic_reconstruction_actor_constructor.<locals>.probe at 0x000001D247491B88>
2020-07-25T22:46:38.3976318Z timeout = 30, retry_interval_ms = 100
2020-07-25T22:46:38.3976717Z 
2020-07-25T22:46:38.3977227Z     def wait_for_condition(condition_predictor, timeout=30, retry_interval_ms=100):
2020-07-25T22:46:38.3977562Z         """A helper function that waits until a condition is met.
2020-07-25T22:46:38.3977810Z     
2020-07-25T22:46:38.3978044Z         Args:
2020-07-25T22:46:38.3978298Z             condition_predictor: A function that predicts the condition.
2020-07-25T22:46:38.3978557Z             timeout: Maximum timeout in seconds.
2020-07-25T22:46:38.3978825Z             retry_interval_ms: Retry interval in milliseconds.
2020-07-25T22:46:38.3979065Z     
2020-07-25T22:46:38.3979294Z         Raises:
2020-07-25T22:46:38.3979546Z             RuntimeError: If the condition is not met before the timeout expires.
2020-07-25T22:46:38.3979793Z         """
2020-07-25T22:46:38.3980026Z         start = time.time()
2020-07-25T22:46:38.3980408Z         while time.time() - start <= timeout:
2020-07-25T22:46:38.3980758Z             if condition_predictor():
2020-07-25T22:46:38.3980999Z                 return
2020-07-25T22:46:38.3981846Z             time.sleep(retry_interval_ms / 1000.0)
2020-07-25T22:46:38.3982122Z >       raise RuntimeError("The condition wasn't met before the timeout expired.")
2020-07-25T22:46:38.3982393Z E       RuntimeError: The condition wasn't met before the timeout expired.
2020-07-25T22:46:38.3982557Z 
2020-07-25T22:46:38.3982804Z d:\a\ray\ray\python\ray\test_utils.py:257: RuntimeError
2020-07-25T22:46:38.3983072Z ---------------------------- Captured stdout call -----------------------------
2020-07-25T22:46:38.3983356Z (pid=3336) E0725 22:32:23.519379  3336  9104 direct_actor_transport.h:440] timed out waiting for 0, cancelling all queued tasks
2020-07-25T22:46:38.3983628Z ---------------------------- Captured stderr call -----------------------------
2020-07-25T22:46:38.3983964Z 2020-07-25 22:31:48,264	INFO resource_spec.py:223 -- Starting Ray with 4.39 GiB memory available for workers and up to 0.15 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-25T22:46:38.3984277Z 2020-07-25 22:31:48,494	INFO services.py:1193 -- View the Ray dashboard at localhost:8265
2020-07-25T22:46:38.3984569Z I0725 22:31:48.512190  9676  8628 global_state_accessor.cc:25] Redis server address = 10.1.0.4:6379, is test flag = 0
2020-07-25T22:46:38.3984846Z I0725 22:31:49.138216  9676  8628 redis_client.cc:146] RedisClient connected.
2020-07-25T22:46:38.3985208Z I0725 22:31:49.138216  9676  8628 redis_gcs_client.cc:88] RedisGcsClient Connected.
2020-07-25T22:46:38.3985585Z I0725 22:31:49.936249  9676  8628 service_based_gcs_client.cc:193] Reconnected to GCS server: 10.1.0.4:60043
2020-07-25T22:46:38.3985874Z I0725 22:31:49.937249  9676  8628 service_based_accessor.cc:91] Reestablishing subscription for job info.
2020-07-25T22:46:38.3986158Z I0725 22:31:49.937249  9676  8628 service_based_accessor.cc:401] Reestablishing subscription for actor info.
2020-07-25T22:46:38.3986447Z I0725 22:31:49.937249  9676  8628 service_based_accessor.cc:768] Reestablishing subscription for node info.
2020-07-25T22:46:38.3986920Z I0725 22:31:49.937249  9676  8628 service_based_accessor.cc:1040] Reestablishing subscription for task info.
2020-07-25T22:46:38.3987454Z I0725 22:31:49.937249  9676  8628 service_based_accessor.cc:1212] Reestablishing subscription for object locations.
2020-07-25T22:46:38.3988060Z I0725 22:31:49.937249  9676  8628 service_based_accessor.cc:1323] Reestablishing subscription for worker failures.
2020-07-25T22:46:38.3988576Z I0725 22:31:49.937249  9676  8628 service_based_gcs_client.cc:86] ServiceBasedGcsClient Connected.
2020-07-25T22:46:38.3988880Z I0725 22:31:50.047255  9676  8628 global_state_accessor.cc:25] Redis server address = 10.1.0.4:6379, is test flag = 0
2020-07-25T22:46:38.3989165Z I0725 22:31:50.049257  9676  8628 redis_client.cc:146] RedisClient connected.
2020-07-25T22:46:38.3989446Z I0725 22:31:50.049257  9676  8628 redis_gcs_client.cc:88] RedisGcsClient Connected.
2020-07-25T22:46:38.3989733Z I0725 22:31:50.050254  9676  8628 service_based_gcs_client.cc:193] Reconnected to GCS server: 10.1.0.4:60043
2020-07-25T22:46:38.3990021Z I0725 22:31:50.051254  9676  8628 service_based_accessor.cc:91] Reestablishing subscription for job info.
2020-07-25T22:46:38.3990305Z I0725 22:31:50.051254  9676  8628 service_based_accessor.cc:401] Reestablishing subscription for actor info.
2020-07-25T22:46:38.3990627Z I0725 22:31:50.051254  9676  8628 service_based_accessor.cc:768] Reestablishing subscription for node info.
2020-07-25T22:46:38.3990924Z I0725 22:31:50.051254  9676  8628 service_based_accessor.cc:1040] Reestablishing subscription for task info.
2020-07-25T22:46:38.3991210Z I0725 22:31:50.051254  9676  8628 service_based_accessor.cc:1212] Reestablishing subscription for object locations.
2020-07-25T22:46:38.3991500Z I0725 22:31:50.051254  9676  8628 service_based_accessor.cc:1323] Reestablishing subscription for worker failures.
2020-07-25T22:46:38.3991917Z I0725 22:31:50.051254  9676  8628 service_based_gcs_client.cc:86] ServiceBasedGcsClient Connected.
2020-07-25T22:46:38.3992286Z 2020-07-25 22:31:50,083	INFO resource_spec.py:223 -- Starting Ray with 4.74 GiB memory available for workers and up to 0.09 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-25T22:46:38.3992598Z 2020-07-25 22:31:50,207	WARNING worker.py:1128 -- The dashboard on node fv-az107 failed with the following error:
2020-07-25T22:46:38.3992856Z Traceback (most recent call last):
2020-07-25T22:46:38.3993117Z   File "d:\a\ray\ray\python\ray\dashboard/dashboard.py", line 960, in <module>
2020-07-25T22:46:38.3993383Z     metrics_export_address=metrics_export_address)
2020-07-25T22:46:38.3993643Z   File "d:\a\ray\ray\python\ray\dashboard/dashboard.py", line 513, in __init__
2020-07-25T22:46:38.3993914Z     build_dir = setup_static_dir(self.app)
2020-07-25T22:46:38.3994175Z   File "d:\a\ray\ray\python\ray\dashboard/dashboard.py", line 414, in setup_static_dir
2020-07-25T22:46:38.3994610Z     "&& npm run build)", build_dir)
2020-07-25T22:46:38.3995125Z FileNotFoundError: [Errno 2] Dashboard build directory not found. If installing from source, please follow the additional steps required to build the dashboard(cd python/ray/dashboard/client && npm ci && npm run build): 'd:\\a\\ray\\ray\\python\\ray\\dashboard\\client/build'
2020-07-25T22:46:38.3995401Z 
2020-07-25T22:46:38.3995786Z 2020-07-25 22:31:50,590	INFO resource_spec.py:223 -- Starting Ray with 4.74 GiB memory available for workers and up to 0.09 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-25T22:46:38.3996164Z 2020-07-25 22:31:51,818	INFO resource_spec.py:223 -- Starting Ray with 4.64 GiB memory available for workers and up to 0.09 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-25T22:46:38.3996517Z 2020-07-25 22:31:52,844	WARNING worker.py:1128 -- The node with node id b2be5fb8a027fac3b4d34a6d0cb4b450a02b1f4a has been marked dead because the detector has missed too many heartbeats from it.
2020-07-25T22:46:38.3996996Z E0725 22:31:53.412569  9676  7384 task_manager.cc:323] Task failed: IOError: 14: failed to connect to all addresses: Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=com_github_ray_project_ray.python.ray.tests.test_reconstruction, class_name=Actor, function_name=dependent_task, function_hash=}, task_id=04df2a355e22995fe725323c0100, job_id=0100, num_args=2, num_returns=2, actor_task_spec={actor_id=e725323c0100, actor_caller_id=ffffffffffffffffffffffff0100, actor_counter=2}
2020-07-25T22:46:38.3997572Z E0725 22:32:23.519379  9676  7384 task_manager.cc:323] Task failed: IOError: 2: client cancelled stale rpc: Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=com_github_ray_project_ray.python.ray.tests.test_reconstruction, class_name=Actor, function_name=dependent_task, function_hash=}, task_id=a438aa430f188597e725323c0100, job_id=0100, num_args=2, num_returns=2, actor_task_spec={actor_id=e725323c0100, actor_caller_id=ffffffffffffffffffffffff0100, actor_counter=3}
2020-07-25T22:46:38.3997969Z ______________ test_basic_reconstruction_actor_constructor[True] ______________
2020-07-25T22:46:38.3998195Z 
2020-07-25T22:46:38.3998493Z ray_start_cluster = <ray.cluster_utils.Cluster object at 0x000001D247477708>
2020-07-25T22:46:38.3998775Z reconstruction_enabled = True
2020-07-25T22:46:38.3998943Z 
2020-07-25T22:46:38.3999207Z     @pytest.mark.parametrize("reconstruction_enabled", [False, True])
2020-07-25T22:46:38.3999490Z     def test_basic_reconstruction_actor_constructor(ray_start_cluster,
2020-07-25T22:46:38.3999777Z                                                     reconstruction_enabled):
2020-07-25T22:46:38.4000103Z         config = {
2020-07-25T22:46:38.4000360Z             "num_heartbeats_timeout": 10,
2020-07-25T22:46:38.4000672Z             "raylet_heartbeat_timeout_milliseconds": 100,
2020-07-25T22:46:38.4000953Z             "initial_reconstruction_timeout_milliseconds": 200,
2020-07-25T22:46:38.4001257Z         }
2020-07-25T22:46:38.4001574Z         # Workaround to reset the config to the default value.
2020-07-25T22:46:38.4001842Z         if not reconstruction_enabled:
2020-07-25T22:46:38.4002108Z             config["lineage_pinning_enabled"] = 0
2020-07-25T22:46:38.4002367Z         config = json.dumps(config)
2020-07-25T22:46:38.4002626Z     
2020-07-25T22:46:38.4002877Z         cluster = ray_start_cluster
2020-07-25T22:46:38.4003137Z         # Head node with no resources.
2020-07-25T22:46:38.4003395Z         cluster.add_node(
2020-07-25T22:46:38.4003656Z             num_cpus=0,
2020-07-25T22:46:38.4003910Z             _internal_config=config,
2020-07-25T22:46:38.4004185Z             enable_object_reconstruction=reconstruction_enabled)
2020-07-25T22:46:38.4004458Z         ray.init(address=cluster.address)
2020-07-25T22:46:38.4004721Z         # Node to place the initial object.
2020-07-25T22:46:38.4004982Z         node_to_kill = cluster.add_node(
2020-07-25T22:46:38.4005255Z             num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
2020-07-25T22:46:38.4005524Z         cluster.add_node(
2020-07-25T22:46:38.4005796Z             num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
2020-07-25T22:46:38.4006062Z         cluster.wait_for_nodes()
2020-07-25T22:46:38.4006313Z     
2020-07-25T22:46:38.4006621Z         @ray.remote(max_retries=1 if reconstruction_enabled else 0)
2020-07-25T22:46:38.4006912Z         def large_object():
2020-07-25T22:46:38.4007174Z             return np.zeros(10**7, dtype=np.uint8)
2020-07-25T22:46:38.4007436Z     
2020-07-25T22:46:38.4007702Z         # Both the constructor and a method depend on the large object.
2020-07-25T22:46:38.4007976Z         @ray.remote(max_restarts=-1)
2020-07-25T22:46:38.4008233Z         class Actor:
2020-07-25T22:46:38.4008490Z             def __init__(self, x):
2020-07-25T22:46:38.4008741Z                 pass
2020-07-25T22:46:38.4008992Z     
2020-07-25T22:46:38.4009248Z             def dependent_task(self, x):
2020-07-25T22:46:38.4009501Z                 return
2020-07-25T22:46:38.4009750Z     
2020-07-25T22:46:38.4009999Z             def pid(self):
2020-07-25T22:46:38.4010257Z                 return os.getpid()
2020-07-25T22:46:38.4010506Z     
2020-07-25T22:46:38.4011262Z         obj = large_object.options(resources={"node1": 1}).remote()
2020-07-25T22:46:38.4011550Z         a = Actor.options(resources={"node1": 1}).remote(obj)
2020-07-25T22:46:38.4011828Z         ray.get(a.dependent_task.remote(obj))
2020-07-25T22:46:38.4012087Z         pid = ray.get(a.pid.remote())
2020-07-25T22:46:38.4012444Z     
2020-07-25T22:46:38.4012694Z         # Workaround to kill the actor process too since there is a bug where the
2020-07-25T22:46:38.4013510Z         # actor's plasma client hangs after the plasma store has exited.
2020-07-25T22:46:38.4013787Z         os.kill(pid, SIGKILL)
2020-07-25T22:46:38.4014030Z     
2020-07-25T22:46:38.4014276Z         cluster.remove_node(node_to_kill, allow_graceful=False)
2020-07-25T22:46:38.4014581Z         cluster.add_node(
2020-07-25T22:46:38.4015059Z             num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
2020-07-25T22:46:38.4015325Z     
2020-07-25T22:46:38.4015579Z         wait_for_pid_to_exit(pid)
2020-07-25T22:46:38.4015828Z     
2020-07-25T22:46:38.4016085Z         # Wait for the actor to restart.
2020-07-25T22:46:38.4016362Z         def probe():
2020-07-25T22:46:38.4016612Z             try:
2020-07-25T22:46:38.4016870Z                 ray.get(a.dependent_task.remote(obj))
2020-07-25T22:46:38.4017134Z                 return True
2020-07-25T22:46:38.4017403Z             except ray.exceptions.RayActorError:
2020-07-25T22:46:38.4018474Z                 return False
2020-07-25T22:46:38.4018729Z             except (ray.exceptions.RayTaskError,
2020-07-25T22:46:38.4018990Z                     ray.exceptions.UnreconstructableError):
2020-07-25T22:46:38.4019241Z                 return True
2020-07-25T22:46:38.4019475Z     
2020-07-25T22:46:38.4019709Z >       wait_for_condition(probe)
2020-07-25T22:46:38.4019864Z 
2020-07-25T22:46:38.4020151Z \\?\C:\Users\RUNNER~1\AppData\Local\Temp\Bazel.runfiles_ymwnfkwp\runfiles\com_github_ray_project_ray\python\ray\tests\test_reconstruction.py:368: 
2020-07-25T22:46:38.4020448Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
2020-07-25T22:46:38.4020723Z 
2020-07-25T22:46:38.4020996Z condition_predictor = <function test_basic_reconstruction_actor_constructor.<locals>.probe at 0x000001D2474DCEE8>
2020-07-25T22:46:38.4021265Z timeout = 30, retry_interval_ms = 100
2020-07-25T22:46:38.4021460Z 
2020-07-25T22:46:38.4021717Z     def wait_for_condition(condition_predictor, timeout=30, retry_interval_ms=100):
2020-07-25T22:46:38.4021988Z         """A helper function that waits until a condition is met.
2020-07-25T22:46:38.4022453Z     
2020-07-25T22:46:38.4022698Z         Args:
2020-07-25T22:46:38.4022970Z             condition_predictor: A function that predicts the condition.
2020-07-25T22:46:38.4023254Z             timeout: Maximum timeout in seconds.
2020-07-25T22:46:38.4023529Z             retry_interval_ms: Retry interval in milliseconds.
2020-07-25T22:46:38.4023794Z     
2020-07-25T22:46:38.4024040Z         Raises:
2020-07-25T22:46:38.4024382Z             RuntimeError: If the condition is not met before the timeout expires.
2020-07-25T22:46:38.4024659Z         """
2020-07-25T22:46:38.4024913Z         start = time.time()
2020-07-25T22:46:38.4025181Z         while time.time() - start <= timeout:
2020-07-25T22:46:38.4025473Z             if condition_predictor():
2020-07-25T22:46:38.4025732Z                 return
2020-07-25T22:46:38.4026004Z             time.sleep(retry_interval_ms / 1000.0)
2020-07-25T22:46:38.4026281Z >       raise RuntimeError("The condition wasn't met before the timeout expired.")
2020-07-25T22:46:38.4026566Z E       RuntimeError: The condition wasn't met before the timeout expired.
2020-07-25T22:46:38.4026786Z 
2020-07-25T22:46:38.4027046Z d:\a\ray\ray\python\ray\test_utils.py:257: RuntimeError
2020-07-25T22:46:38.4027331Z ---------------------------- Captured stdout call -----------------------------
2020-07-25T22:46:38.4027689Z (pid=5324) E0725 22:33:02.847265  5324  8772 direct_actor_transport.h:440] timed out waiting for 0, cancelling all queued tasks
2020-07-25T22:46:38.4027995Z ---------------------------- Captured stderr call -----------------------------
2020-07-25T22:46:38.4028340Z 2020-07-25 22:32:27,542	INFO resource_spec.py:223 -- Starting Ray with 4.39 GiB memory available for workers and up to 0.15 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-25T22:46:38.4028672Z 2020-07-25 22:32:27,768	INFO services.py:1193 -- View the Ray dashboard at localhost:8265
2020-07-25T22:46:38.4028981Z I0725 22:32:27.786980  9676  8628 global_state_accessor.cc:25] Redis server address = 10.1.0.4:6379, is test flag = 0
2020-07-25T22:46:38.4029282Z I0725 22:32:28.076995  9676  8628 redis_client.cc:146] RedisClient connected.
2020-07-25T22:46:38.4029570Z I0725 22:32:28.076995  9676  8628 redis_gcs_client.cc:88] RedisGcsClient Connected.
2020-07-25T22:46:38.4029877Z I0725 22:32:29.153057  9676  8628 service_based_gcs_client.cc:193] Reconnected to GCS server: 10.1.0.4:60314
2020-07-25T22:46:38.4030229Z I0725 22:32:29.153057  9676  8628 service_based_accessor.cc:91] Reestablishing subscription for job info.
2020-07-25T22:46:38.4030580Z I0725 22:32:29.153057  9676  8628 service_based_accessor.cc:401] Reestablishing subscription for actor info.
2020-07-25T22:46:38.4031145Z I0725 22:32:29.153057  9676  8628 service_based_accessor.cc:768] Reestablishing subscription for node info.
2020-07-25T22:46:38.4031538Z I0725 22:32:29.153057  9676  8628 service_based_accessor.cc:1040] Reestablishing subscription for task info.
2020-07-25T22:46:38.4031852Z I0725 22:32:29.153057  9676  8628 service_based_accessor.cc:1212] Reestablishing subscription for object locations.
2020-07-25T22:46:38.4032163Z I0725 22:32:29.153057  9676  8628 service_based_accessor.cc:1323] Reestablishing subscription for worker failures.
2020-07-25T22:46:38.4032468Z I0725 22:32:29.153057  9676  8628 service_based_gcs_client.cc:86] ServiceBasedGcsClient Connected.
2020-07-25T22:46:38.4032783Z I0725 22:32:29.263064  9676  8628 global_state_accessor.cc:25] Redis server address = 10.1.0.4:6379, is test flag = 0
2020-07-25T22:46:38.4033076Z I0725 22:32:29.265064  9676  8628 redis_client.cc:146] RedisClient connected.
2020-07-25T22:46:38.4033362Z I0725 22:32:29.265064  9676  8628 redis_gcs_client.cc:88] RedisGcsClient Connected.
2020-07-25T22:46:38.4033754Z I0725 22:32:29.267064  9676  8628 service_based_gcs_client.cc:193] Reconnected to GCS server: 10.1.0.4:60314
2020-07-25T22:46:38.4034063Z I0725 22:32:29.267064  9676  8628 service_based_accessor.cc:91] Reestablishing subscription for job info.
2020-07-25T22:46:38.4034363Z I0725 22:32:29.267064  9676  8628 service_based_accessor.cc:401] Reestablishing subscription for actor info.
2020-07-25T22:46:38.4034660Z I0725 22:32:29.267064  9676  8628 service_based_accessor.cc:768] Reestablishing subscription for node info.
2020-07-25T22:46:38.4034966Z I0725 22:32:29.267064  9676  8628 service_based_accessor.cc:1040] Reestablishing subscription for task info.
2020-07-25T22:46:38.4035345Z I0725 22:32:29.267064  9676  8628 service_based_accessor.cc:1212] Reestablishing subscription for object locations.
2020-07-25T22:46:38.4035714Z I0725 22:32:29.267064  9676  8628 service_based_accessor.cc:1323] Reestablishing subscription for worker failures.
2020-07-25T22:46:38.4036065Z I0725 22:32:29.267064  9676  8628 service_based_gcs_client.cc:86] ServiceBasedGcsClient Connected.
2020-07-25T22:46:38.4036521Z 2020-07-25 22:32:29,306	INFO resource_spec.py:223 -- Starting Ray with 4.74 GiB memory available for workers and up to 0.09 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-25T22:46:38.4036836Z 2020-07-25 22:32:29,583	WARNING worker.py:1128 -- The dashboard on node fv-az107 failed with the following error:
2020-07-25T22:46:38.4037094Z Traceback (most recent call last):
2020-07-25T22:46:38.4037353Z   File "d:\a\ray\ray\python\ray\dashboard/dashboard.py", line 960, in <module>
2020-07-25T22:46:38.4037613Z     metrics_export_address=metrics_export_address)
2020-07-25T22:46:38.4037876Z   File "d:\a\ray\ray\python\ray\dashboard/dashboard.py", line 513, in __init__
2020-07-25T22:46:38.4038136Z     build_dir = setup_static_dir(self.app)
2020-07-25T22:46:38.4038397Z   File "d:\a\ray\ray\python\ray\dashboard/dashboard.py", line 414, in setup_static_dir
2020-07-25T22:46:38.4038660Z     "&& npm run build)", build_dir)
2020-07-25T22:46:38.4038996Z FileNotFoundError: [Errno 2] Dashboard build directory not found. If installing from source, please follow the additional steps required to build the dashboard(cd python/ray/dashboard/client && npm ci && npm run build): 'd:\\a\\ray\\ray\\python\\ray\\dashboard\\client/build'
2020-07-25T22:46:38.4039260Z 
2020-07-25T22:46:38.4039550Z 2020-07-25 22:32:29,866	INFO resource_spec.py:223 -- Starting Ray with 4.74 GiB memory available for workers and up to 0.09 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-25T22:46:38.4039889Z 2020-07-25 22:32:31,047	INFO resource_spec.py:223 -- Starting Ray with 4.64 GiB memory available for workers and up to 0.09 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-25T22:46:38.4040347Z E0725 22:32:32.081213  9676  7832 core_worker.cc:394] Resubmitting task that produced lost plasma object: Type=NORMAL_TASK, Language=PYTHON, Resources: {CPU: 1, node1: 1, }, function_descriptor={type=PythonFunctionDescriptor, module_name=com_github_ray_project_ray.python.ray.tests.test_reconstruction, class_name=, function_name=large_object, function_hash=98a9ea3de8afac8e008142a048452b8f4baff8e6}, task_id=6384c128cdbeca96ffffffff0100, job_id=0100, num_args=0, num_returns=1
2020-07-25T22:46:38.4040834Z 2020-07-25 22:32:32,084	WARNING worker.py:1128 -- The node with node id 9a2181e97f837ff55051eb873983d9a8b906df48 has been marked dead because the detector has missed too many heartbeats from it.
2020-07-25T22:46:38.4041277Z E0725 22:32:32.742575  9676  7832 task_manager.cc:323] Task failed: IOError: 14: failed to connect to all addresses: Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=com_github_ray_project_ray.python.ray.tests.test_reconstruction, class_name=Actor, function_name=dependent_task, function_hash=}, task_id=f82735b7e03dcb6af3f4c2d10100, job_id=0100, num_args=2, num_returns=2, actor_task_spec={actor_id=f3f4c2d10100, actor_caller_id=ffffffffffffffffffffffff0100, actor_counter=2}
2020-07-25T22:46:38.4041851Z E0725 22:33:02.847265  9676  7832 task_manager.cc:323] Task failed: IOError: 2: client cancelled stale rpc: Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=com_github_ray_project_ray.python.ray.tests.test_reconstruction, class_name=Actor, function_name=dependent_task, function_hash=}, task_id=aa33b34bb97c67d1f3f4c2d10100, job_id=0100, num_args=2, num_returns=2, actor_task_spec={actor_id=f3f4c2d10100, actor_caller_id=ffffffffffffffffffffffff0100, actor_counter=3}

Contributor commented: No apparent connection to Windows to me, but not sure.

@pytest.mark.parametrize("reconstruction_enabled", [False, True])
def test_basic_reconstruction_actor_constructor(ray_start_cluster,
reconstruction_enabled):
2 changes: 1 addition & 1 deletion python/ray/tests/test_reference_counting_2.py
@@ -182,7 +182,7 @@ def ref_not_exists():
inner_oid = ray.ObjectRef(inner_oid_binary)
return not worker.core_worker.object_exists(inner_oid)

-assert wait_for_condition(ref_not_exists)
+wait_for_condition(ref_not_exists)


# Call a recursive chain of tasks that pass a serialized reference that was