Skip to content

Commit

Permalink
[serve] Restore "Get new handle to controller if killed" (ray-project…
Browse files Browse the repository at this point in the history
…#23283) (ray-project#23338)

ray-project#23336 reverted ray-project#23283. ray-project#23283 did pass CI before merging. However, when it merged, it began to fail because it used commands that were outdated on the Master branch in `test_cli.py` (specifically `serve info` instead of `serve config`). This change restores ray-project#23283 and updates its tests commands.
  • Loading branch information
shrekris-anyscale authored Mar 18, 2022
1 parent 5ab634f commit c668039
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 5 deletions.
1 change: 1 addition & 0 deletions dashboard/optional_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ async def decorator(self, *args, **kwargs):

if connect_to_serve:
serve.start(detached=True, _override_controller_namespace="serve")

return await f(self, *args, **kwargs)
except Exception as e:
logger.exception(f"Unexpected error in handler: {e}")
Expand Down
31 changes: 26 additions & 5 deletions python/ray/serve/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)

from fastapi import APIRouter, FastAPI
from ray.exceptions import RayActorError
from ray.experimental.dag.class_node import ClassNode
from ray.experimental.dag.function_node import FunctionNode
from starlette.requests import Request
Expand Down Expand Up @@ -82,7 +83,7 @@


_INTERNAL_REPLICA_CONTEXT = None
_global_client = None
_global_client: "Client" = None

_UUID_RE = re.compile(
"[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89aAbB][a-f0-9]{3}-[a-f0-9]{12}"
Expand Down Expand Up @@ -118,9 +119,28 @@ def _get_controller_namespace(
return controller_namespace


def internal_get_global_client(_override_controller_namespace: Optional[str] = None):
if _global_client is not None:
return _global_client
def internal_get_global_client(
_override_controller_namespace: Optional[str] = None,
_health_check_controller: bool = False,
) -> "Client":
"""Gets the global client, which stores the controller's handle.
Args:
_override_controller_namespace (Optional[str]): If None and there's no
cached client, searches for the controller in this namespace.
_health_check_controller (bool): If True, run a health check on the
cached controller if it exists. If the check fails, try reconnecting
to the controller.
"""

try:
if _global_client is not None:
if _health_check_controller:
ray.get(_global_client._controller.check_alive.remote())
return _global_client
except RayActorError:
logger.info("The cached controller has died. Reconnecting.")
_set_global_client(None)

return _connect(_override_controller_namespace=_override_controller_namespace)

Expand Down Expand Up @@ -711,7 +731,8 @@ def start(

try:
client = internal_get_global_client(
_override_controller_namespace=_override_controller_namespace
_override_controller_namespace=_override_controller_namespace,
_health_check_controller=True,
)
logger.info(
"Connecting to existing Serve instance in namespace "
Expand Down
4 changes: 4 additions & 0 deletions python/ray/serve/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ async def __init__(

asyncio.get_event_loop().create_task(self.run_control_loop())

def check_alive(self) -> None:
"""No-op to check if this controller is alive."""
return

def record_autoscaling_metrics(self, data: Dict[str, float], send_timestamp: float):
self.autoscaling_metrics_store.add_metrics_point(data, send_timestamp)

Expand Down
39 changes: 39 additions & 0 deletions python/ray/serve/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def test_deploy(ray_start_stop):
== "Hello shallow world!"
)

serve.shutdown()
ray.shutdown()


Expand Down Expand Up @@ -371,5 +372,43 @@ def test_run_runtime_env(ray_start_stop):
p.wait()


@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.")
@pytest.mark.parametrize("use_command", [True, False])
def test_idempotence_after_controller_death(ray_start_stop, use_command: bool):
"""Check that CLI is idempotent even if controller dies."""

config_file_name = os.path.join(
os.path.dirname(__file__), "test_config_files", "two_deployments.yaml"
)
success_message_fragment = b"Sent deploy request successfully!"
deploy_response = subprocess.check_output(["serve", "deploy", config_file_name])
assert success_message_fragment in deploy_response

ray.init(address="auto", namespace="serve")
serve.start(detached=True)
assert len(serve.list_deployments()) == 2

# Kill controller
if use_command:
subprocess.check_output(["serve", "shutdown"])
else:
serve.shutdown()

info_response = subprocess.check_output(["serve", "config"])
info = yaml.safe_load(info_response)

assert "deployments" in info
assert len(info["deployments"]) == 0

deploy_response = subprocess.check_output(["serve", "deploy", config_file_name])
assert success_message_fragment in deploy_response

# Restore testing controller
serve.start(detached=True)
assert len(serve.list_deployments()) == 2
serve.shutdown()
ray.shutdown()


if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))
38 changes: 38 additions & 0 deletions python/ray/serve/tests/test_standalone2.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import sys

import pytest
from ray.exceptions import RayActorError
import requests

import ray
from ray import serve
from ray.serve.api import internal_get_global_client
from ray._private.test_utils import wait_for_condition


@pytest.fixture
Expand Down Expand Up @@ -75,5 +77,41 @@ def f(*args):
serve.shutdown()


@pytest.mark.parametrize("detached", [True, False])
def test_refresh_controller_after_death(shutdown_ray, detached):
"""Check if serve.start() refreshes the controller handle if it's dead."""

ray_namespace = "ray_namespace"
controller_namespace = "controller_namespace"

ray.init(namespace=ray_namespace)
serve.shutdown() # Ensure serve isn't running before beginning the test
serve.start(detached=detached, _override_controller_namespace=controller_namespace)

old_handle = internal_get_global_client()._controller
ray.kill(old_handle, no_restart=True)

def controller_died(handle):
try:
ray.get(handle.check_alive.remote())
return False
except RayActorError:
return True

wait_for_condition(controller_died, handle=old_handle, timeout=15)

# Call start again to refresh handle
serve.start(detached=detached, _override_controller_namespace=controller_namespace)

new_handle = internal_get_global_client()._controller
assert new_handle is not old_handle

# Health check should not error
ray.get(new_handle.check_alive.remote())

serve.shutdown()
ray.shutdown()


if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))

0 comments on commit c668039

Please sign in to comment.