added test abort

vllm-project · robertgshaw2-redhat · Sep 18, 2024 · Aug 28, 2024 · Aug 29, 2024 · Aug 29, 2024
commit 5b3535d1188a7b2f6f59dbeb15096326e7b6c1b2
diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py
diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py
@@ -19,7 +19,8 @@
 
 MODEL = "Qwen/Qwen2-0.5B-Instruct"
 ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
-RAISED_ERROR = KeyError("foo")
+RAISED_ERROR = KeyError
+RAISED_VALUE = "foo"
 
 
 @pytest.fixture(scope="function")
@@ -36,7 +37,8 @@ def run_with_evil_forward(engine_args: AsyncEngineArgs, ipc_path: str):
         ipc_path=ipc_path)
 
     # Raise error during first forward pass.
-    engine.engine.model_executor.execute_model = Mock(side_effect=RAISED_ERROR)
+    engine.engine.model_executor.execute_model = Mock(
+        side_effect=RAISED_ERROR(RAISED_VALUE))
 
     # Run engine.
     engine.start()
@@ -50,46 +52,32 @@ async def test_evil_forward(tmp_socket):
 
         client = await engine.make_client()
 
-        # Fast health probe.
-        fast_health_probe_task = asyncio.create_task(
-            client.run_check_health_loop(timeout=1.0))
-
         # Server should be healthy after initial probe.
         await asyncio.sleep(2.0)
         await client.check_health()
 
         # Throws an error in first forward pass.
-        try:
+        with pytest.raises(RAISED_ERROR):
             async for _ in client.generate(inputs="Hello my name is",
                                            sampling_params=SamplingParams(),
                                            request_id=uuid.uuid4()):
                 pass
-        except Exception as e:
-            # First exception should be a RAISED_ERROR
-            assert repr(e) == repr(RAISED_ERROR)
-            assert client.errored
+        assert client.errored
 
         # Engine is errored, should get ENGINE_DEAD_ERROR.
-        try:
+        with pytest.raises(MQEngineDeadError):
             async for _ in client.generate(inputs="Hello my name is",
                                            sampling_params=SamplingParams(),
                                            request_id=uuid.uuid4()):
                 pass
-        except Exception as e:
-            # Next exception should be an ENGINE_DEAD_ERROR
-            assert client.errored, "Client should be dead."
-            assert isinstance(e, MQEngineDeadError), (
-                "Engine should be dead and raise ENGINE_DEAD_ERROR")
+        assert client.errored
 
-        await asyncio.sleep(2.0)
-        try:
+        await asyncio.sleep(1.0)
+        with pytest.raises(RAISED_ERROR):
             await client.check_health()
-        except Exception as e:
-            assert repr(e) == repr(RAISED_ERROR), (
-                "Health check raise the original error.")
+        assert client.errored
 
-        # Cleanup
-        await fast_health_probe_task
+        # Shutdown.
         client.close()
 
 
@@ -120,25 +108,18 @@ async def test_failed_health_check(tmp_socket):
 
         # Health probe should throw RAISED_ERROR.
         await asyncio.sleep(10)
-        try:
+
+        with pytest.raises(RAISED_ERROR):
             await client.check_health()
-        except Exception as e:
-            assert client.errored, "Client should be dead."
-            assert repr(e) == repr(RAISED_ERROR), (
-                "Health check raise the original error.")
+        assert client.errored
 
         # Generate call should throw ENGINE_DEAD_ERROR
-        try:
+        with pytest.raises(MQEngineDeadError):
             async for _ in client.generate(inputs="Hello my name is",
                                            sampling_params=SamplingParams(),
                                            request_id=uuid.uuid4()):
                 pass
-        except Exception as e:
-            assert client.errored, "Client should be dead."
-            assert isinstance(e, MQEngineDeadError), (
-                "Engine should be dead and raise ENGINE_DEAD_ERROR")
 
-        # Cleanup
         client.close()
 
 
@@ -173,34 +154,26 @@ async def bad_abort_after_2s():
             await asyncio.sleep(2.0)
             await client.abort(request_id="foo")
 
-            # Immediately should trigger error.
-            try:
-                await client.check_health()
-            except Exception as e:
-                assert client.errored, "Client should be dead."
-                assert repr(e) == repr(RAISED_ERROR), (
-                    "Health check raise the original error.")
-
         # Trigger an abort in 2s from now.
         abort_task = asyncio.create_task(bad_abort_after_2s())
 
         # Exception in abort() will happen during this generation.
-        # This will kill the engine and should return ENGINE_DEAD_ERROR.
-        try:
+        # This will kill the engine, so generate() should raise
+        # MQEngineDeadError referencing the original KeyError("foo").
+        with pytest.raises(MQEngineDeadError) as execinfo:
             async for _ in client.generate(
                     inputs="Hello my name is",
                     sampling_params=SamplingParams(max_tokens=2000),
                     request_id=uuid.uuid4()):
                 pass
-        except Exception as e:
-            print(f"error is: {e}")
-            # Next exception should be an ENGINE_DEAD_ERROR
-            assert isinstance(e, MQEngineDeadError), (
-                "Engine should be dead and raise ENGINE_DEAD_ERROR")
-            assert client.errored
-
+        assert "KeyError" in repr(execinfo.value)
+        assert client.errored
         await abort_task
 
+        # This should raise the original error.
+        with pytest.raises(RAISED_ERROR):
+            await client.check_health()
+
         client.close()
 
 
@@ -211,31 +184,21 @@ async def test_bad_request(tmp_socket):
 
         client = await engine.make_client()
 
-        # This should fail, but not crash the server.
-        try:
-            print("calling first generate")
+        # Invalid request should fail, but not crash the server.
+        with pytest.raises(ValueError):
             async for _ in client.generate(inputs="Hello my name is",
                                            sampling_params=SamplingParams(),
                                            request_id="abcd-1",
                                            lora_request=LoRARequest(
-                                               "invalid-lora", 1,
-                                               "invalid-path")):
+                                               "invalid-lora", 1, "invalid-path")):
                 pass
-        except Exception as e:
-            print("got exception")
-            assert isinstance(e, ValueError), (
-                "Expected ValueError when a LoRARequest in llm_engine")
 
         # This request should be okay.
         async for _ in client.generate(inputs="Hello my name is",
                                        sampling_params=SamplingParams(),
                                        request_id="abcd-2"):
             pass
 
-        # Confirm server is still running.
-        await asyncio.sleep(10.)
-        await client.check_health()
-
         # Shutdown.
         client.close()
 

diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py
@@ -71,4 +71,4 @@ def ENGINE_DEAD_ERROR(
 
     return MQEngineDeadError(
         "Engine loop is not running. Inspect the stacktrace to "
-        f"find the original error {repr(error)}.")
+        f"find the original error: {repr(error)}.")
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
@@ -165,10 +165,10 @@ async def run_check_health_loop(self, timeout: int):
             logger.debug("Shutting down MQLLMEngineClient check health loop.")
 
         except Exception as e:
-            self.raise_exception(e)
+            self._set_errored(e)
 
     async def run_output_handler_loop(self):
-        """Get RequestOutputs from Engine and stream to request Queues"""
+        """Get RequestOutputs from Engine and stream to Request Queues"""
 
         try:
             while True:
@@ -249,19 +249,15 @@ async def setup(self):
 
     def close(self):
         """Destroy the ZeroMQ Context."""
-        # Close all sockets associated with this context and
-        # then terminate the context.
-        self.output_socket.close()
-        self.input_socket.close()
-        self.health_socket.close()
+        # Close all sockets and terminate the context.
         self.context.destroy(linger=0)
 
         # Cancel background tasks.
         if self.health_loop is not None:
             self.health_loop.cancel()
         self.output_loop.cancel()
 
-    def raise_exception(self, e: BaseException):
+    def _set_errored(self, e: BaseException):
         logger.exception(repr(e))
         if self._errored_with is None:
             self._errored_with = e
@@ -285,35 +281,26 @@ async def _send_get_data_rpc_request(request: RPCStartupRequest,
         frame = await socket.recv(copy=False)
         data = pickle.loads(frame.buffer)
 
-        if isinstance(data, Exception):
-            # Re-raise exceptions returned by the server
+        if isinstance(data, BaseException):
             raise data
-
-        if not isinstance(data, expected_type):
-            # LoRAConfig can be None.
-            if expected_type == LoRAConfig and data is None:
-                pass
-            elif isinstance(data, Exception):
-                logger.error(error_message)
-                raise data
-            else:
-                raise ValueError(error_message)
+        elif not isinstance(data, expected_type):
+            raise ValueError(error_message)
 
         return data
 
     @staticmethod
     async def _send_one_way_rpc_request(request: RPC_REQUEST_T,
                                         socket: Socket):
         """Send one-way RPC request to trigger an action."""
-        # Raise handlable error for graceful shutdown.
+
         if socket.closed:
             raise MQClientClosedError()
 
         await socket.send_multipart((pickle.dumps(request), ))
 
     async def _await_ack(self, error_message: str, socket: Socket):
         """Await acknowledgement that a request succeeded."""
-        # Raise handlable error for graceful shutdown.
+
         if socket.closed:
             raise MQClientClosedError()
 
@@ -325,17 +312,19 @@ async def _await_ack(self, error_message: str, socket: Socket):
 
     @staticmethod
     async def _check_success(error_message: str, socket: Socket):
-        # Raise handlable error for graceful shutdown.
+        """Confirm that socket has a VLLM_RPC_SUCCESS_STR message"""
+
         if socket.closed:
             raise MQClientClosedError()
 
         frame = await socket.recv(copy=False)
         response = pickle.loads(frame.buffer)
 
-        if not isinstance(response, str) or response != VLLM_RPC_SUCCESS_STR:
-            if isinstance(response, BaseException):
-                logger.error(error_message)
-                raise response
+        # Raise error if unsuccessful
+        if isinstance(response, BaseException):
+            raise response
+        elif (not isinstance(response, str) or 
+            response != VLLM_RPC_SUCCESS_STR):
             raise ValueError(error_message)
 
     async def get_tokenizer(self, lora_request: LoRARequest):

diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
@@ -141,9 +141,7 @@ def start(self):
 
     def cleanup(self):
         """Cleanup zeromq state on shutdown."""
-        self.input_socket.close()
-        self.output_socket.close()
-        self.health_socket.close()
+        # Closes all sockets and destroys context.
         self.ctx.destroy(linger=0)
         del self.engine