Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Serve] Revisiting ProxyState to fix draining sequence #41722

Merged
merged 29 commits into from
Jan 22, 2024
Merged
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
39c1447
Merge `start_new_ready_check` into `is_ready`
alexeykudinkin Dec 7, 2023
0231ccf
Cleaned up `try_update_status`;
alexeykudinkin Dec 7, 2023
0ebfe1b
Streamlined `is_healthy`, `is_ready` checks in `ProxyActorWrapper`
alexeykudinkin Dec 7, 2023
b6b3660
Tidying up
alexeykudinkin Dec 7, 2023
feffe80
Simplified `is_healthy`, `is_ready` checks even more
alexeykudinkin Dec 7, 2023
2144898
Fixed `is_drained` check to properly return whether proxy actor is pr…
alexeykudinkin Dec 7, 2023
d04d010
Streamlined proxy's state reconciliation flow
alexeykudinkin Dec 7, 2023
ab1cc40
Simplified readiness, health, drain checks;
alexeykudinkin Dec 8, 2023
5ba6068
Make sure exceptions are properly handled in proxy state reconciliation
alexeykudinkin Dec 8, 2023
d9ef34b
Cleaned up docs
alexeykudinkin Dec 8, 2023
e7c5e1e
Added test for ActorProxyWrapper
alexeykudinkin Dec 8, 2023
759d904
Fixed typos;
alexeykudinkin Dec 8, 2023
953343a
Fixed tests
alexeykudinkin Dec 8, 2023
73d881e
Fixed more tests
alexeykudinkin Dec 8, 2023
9e05832
Fixed some more tests
alexeykudinkin Dec 8, 2023
8d0cdcd
`lint`
alexeykudinkin Dec 8, 2023
136df1c
Fixing typo
alexeykudinkin Dec 8, 2023
189cad4
After rebase clean-up
alexeykudinkin Jan 10, 2024
c8dcb57
Tidying up
alexeykudinkin Jan 10, 2024
06b1755
Make proxy health-check configurable
alexeykudinkin Jan 11, 2024
f4283df
Unify `is_drained` API to require timeout be explicitly provided
alexeykudinkin Jan 11, 2024
b787188
Missing log statement
alexeykudinkin Jan 11, 2024
adfd2f3
Tidying up
alexeykudinkin Jan 18, 2024
eff6f23
Cleaned up exception handling
alexeykudinkin Jan 18, 2024
959feba
`_try_cancel_future` -> `_try_set_exception`
alexeykudinkin Jan 18, 2024
c5d2034
Make timeout_s optional
alexeykudinkin Jan 18, 2024
7002e11
Fixing typo
alexeykudinkin Jan 18, 2024
cc99b4a
Fixing another typo
alexeykudinkin Jan 18, 2024
8a1e954
Fixing tests
alexeykudinkin Jan 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fixed some more tests
Signed-off-by: Alexey Kudinkin <ak@anyscale.com>
  • Loading branch information
alexeykudinkin committed Jan 20, 2024
commit 9e05832f04c0c1fa19820df5e9056b5b27b6f891
91 changes: 48 additions & 43 deletions python/ray/serve/tests/test_proxy_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def test_node_selection(all_nodes):


@patch("ray.serve._private.proxy_state.PROXY_HEALTH_CHECK_PERIOD_S", 5)
def test_proxy_state_reconcile_restarts_unhealthy_proxies(all_nodes):
def test_proxy_state_manager_restarts_unhealthy_proxies(all_nodes):
"""Test the update method in ProxyStateManager would
kill and restart unhealthy proxies.
"""
Expand Down Expand Up @@ -629,9 +629,9 @@ def check_is_ready_for_shutdown():
wait_for_condition(check_is_ready_for_shutdown)


@patch("ray.serve._private.proxy_state.PROXY_READY_CHECK_TIMEOUT_S", 0.1)
@patch("ray.serve._private.proxy_state.PROXY_HEALTH_CHECK_UNHEALTHY_THRESHOLD", 1)
@pytest.mark.parametrize("number_of_worker_nodes", [1])
def test_proxy_starting_timeout_longer_than_env(number_of_worker_nodes, all_nodes):
def test_proxy_state_manager_timing_out_on_start(number_of_worker_nodes, all_nodes):
"""Test update method on ProxyStateManager when the proxy state is STARTING and
when the ready call takes longer than PROXY_READY_CHECK_TIMEOUT_S.

Expand All @@ -652,57 +652,62 @@ def test_proxy_starting_timeout_longer_than_env(number_of_worker_nodes, all_node

# Run update to create proxy states.
proxy_state_manager.update(proxy_nodes=node_ids)
old_proxy_states = {
node_id: state for node_id, state in proxy_state_manager._proxy_states.items()
}

# Ensure 2 proxies are created, one for the head node and another for the worker.
assert len(proxy_state_manager._proxy_states) == len(node_ids)

# Ensure the proxy state statuses before update are STARTING. Also, setting the
# ready call status to be pending to simulate the call never respond.
def check_proxy_state_starting(_proxy_state_manager: ProxyStateManager):
for proxy_state in _proxy_state_manager._proxy_states.values():
assert proxy_state.status == ProxyStatus.STARTING
proxy_state._actor_proxy_wrapper.is_ready_response = None

# Trigger update and ensure proxy states are restarted due to time advanced
# longer than PROXY_READY_CHECK_TIMEOUT_S of 0.1s.
check_proxy_state_starting(_proxy_state_manager=proxy_state_manager)
fake_time.advance(0.11)
proxy_state_manager.update(proxy_nodes=node_ids)
assert all(
[
proxy_state_manager._proxy_states[node_id] != old_proxy_states[node_id]
for node_id in node_ids
]
)
# Ensure the proxy state statuses before update are STARTING, set the
# readiness check to failing
for node_id in node_ids:
proxy_state = proxy_state_manager._proxy_states[node_id]

# Trigger another update with the same advance time and this time the
# proxy states should not be restarted again with the backoff timeout.
old_proxy_states = {
assert proxy_state.status == ProxyStatus.STARTING
proxy_state._actor_proxy_wrapper.is_ready_response = False

# Capture current proxy states (prior to updating)
prev_proxy_states = {
node_id: state for node_id, state in proxy_state_manager._proxy_states.items()
}
check_proxy_state_starting(_proxy_state_manager=proxy_state_manager)
fake_time.advance(0.11)

# Trigger PSM to reconcile
proxy_state_manager.update(proxy_nodes=node_ids)
assert all(
[
proxy_state_manager._proxy_states[node_id] == old_proxy_states[node_id]
for node_id in node_ids
]
)

# Ensure the proxy states turns healthy after the ready call is unblocked.
for proxy_state in proxy_state_manager._proxy_states.values():
# Ensure the proxy state statuses before update are STARTING, set the
# readiness check to failing
for node_id in node_ids:
proxy_state = proxy_state_manager._proxy_states[node_id]
prev_proxy_state = prev_proxy_states[node_id]
# Assert
# - All proxies are restarted
# - Previous proxy states are UNHEALTHY
# - New proxy states are STARTING
assert proxy_state != prev_proxy_state
assert prev_proxy_state.status == ProxyStatus.UNHEALTHY
assert proxy_state.status == ProxyStatus.STARTING
# Mark new proxy readiness checks as passing
proxy_state._actor_proxy_wrapper.is_ready_response = True

# Capture current proxy states again (prior to updating)
prev_proxy_states = {
node_id: state for node_id, state in proxy_state_manager._proxy_states.items()
}

# Trigger PSM to reconcile
proxy_state_manager.update(proxy_nodes=node_ids)
assert all(
[
proxy_state_manager._proxy_states[node_id].status == ProxyStatus.HEALTHY
for node_id in node_ids
]
)

# Ensure the proxy state statuses before update are STARTING, set the
# readiness check to failing
for node_id in node_ids:
proxy_state = proxy_state_manager._proxy_states[node_id]
prev_proxy_state = prev_proxy_states[node_id]
# Assert
# - All proxies are restarted
# - Previous proxy states are UNHEALTHY
# - New proxy states are STARTING
assert proxy_state == prev_proxy_state
assert prev_proxy_state.status == ProxyStatus.HEALTHY
assert proxy_state.status == ProxyStatus.HEALTHY



if __name__ == "__main__":
Expand Down