|
8 | 8 | import uuid
|
9 | 9 | from threading import Thread
|
10 | 10 | from typing import Optional
|
| 11 | +from unittest.mock import MagicMock |
11 | 12 |
|
12 | 13 | import pytest
|
| 14 | +import torch |
13 | 15 | from transformers import AutoTokenizer
|
14 | 16 |
|
15 | 17 | from tests.utils import multi_gpu_test
|
@@ -517,3 +519,72 @@ def kill_first_child():
|
517 | 519 | )
|
518 | 520 |
|
519 | 521 | assert "Engine core initialization failed" in str(e_info.value)
|
| 522 | + |
| 523 | + |
@create_new_process_for_each_test()
def test_engine_core_proc_instantiation_cuda_empty(
        monkeypatch: pytest.MonkeyPatch):
    """
    Verify that EngineCoreProc can be constructed while CUDA_VISIBLE_DEVICES
    is set to the empty string, i.e. the engine frontend itself does not
    require access to any GPU.
    """

    from vllm.v1.engine.core import EngineCoreProc
    from vllm.v1.executor.abstract import Executor

    # Stand-in for the executor class: a MagicMock factory whose side_effect
    # builds a fresh lightweight mock per instantiation.
    executor_cls = MagicMock(spec=Executor)

    def _build_fake_executor(vllm_config):
        """Return a mock exposing only what EngineCoreProc.__init__ touches."""
        # Imported lazily so it is resolved only once the executor is actually
        # instantiated (after the env vars below have been set).
        from vllm.v1.kv_cache_interface import FullAttentionSpec

        fake = MagicMock()
        attn_spec = FullAttentionSpec(block_size=16,
                                      num_kv_heads=1,
                                      head_size=64,
                                      dtype=torch.float16,
                                      use_mla=False)
        fake.get_kv_cache_specs.return_value = [{"default": attn_spec}]
        fake.determine_available_memory.return_value = [1024 * 1024 * 1024]
        fake.initialize_from_config.return_value = None
        fake.max_concurrent_batches = 1
        return fake

    executor_cls.side_effect = _build_fake_executor

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        m.setenv("CUDA_VISIBLE_DEVICES", "")  # No CUDA devices

        from vllm.v1.utils import EngineZmqAddresses

        def _fake_handshake(self, handshake_socket, on_head_node,
                            parallel_config):
            """Skip the real ZMQ handshake and hand back canned addresses."""
            return EngineZmqAddresses(inputs=["tcp://127.0.0.1:5555"],
                                      outputs=["tcp://127.0.0.1:5556"],
                                      coordinator_input=None,
                                      coordinator_output=None)

        # Background processes are not important here
        m.setattr(EngineCoreProc, "startup_handshake", _fake_handshake)

        vllm_config = EngineArgs(
            model="deepseek-ai/DeepSeek-V2-Lite",
            trust_remote_code=True).create_engine_config()
        engine_core_proc = EngineCoreProc(
            vllm_config=vllm_config,
            on_head_node=True,
            handshake_address="tcp://127.0.0.1:12345",
            executor_class=executor_cls,
            log_stats=False,
            engine_index=0,
        )

        engine_core_proc.shutdown()
0 commit comments