File tree Expand file tree Collapse file tree 2 files changed +12
-1
lines changed Expand file tree Collapse file tree 2 files changed +12
-1
lines changed Original file line number Diff line number Diff line change 123
123
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE : int = 163840
124
124
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS : int = 1
125
125
VLLM_SLEEP_WHEN_IDLE : bool = False
126
+ VLLM_MQ_MAX_CHUNK_BYTES_MB : int = 16
126
127
127
128
128
129
def get_default_cache_root ():
@@ -847,6 +848,12 @@ def get_vllm_port() -> Optional[int]:
847
848
# latency penalty when a request eventually comes.
848
849
"VLLM_SLEEP_WHEN_IDLE" :
849
850
lambda : bool (int (os .getenv ("VLLM_SLEEP_WHEN_IDLE" , "0" ))),
851
+
852
+ # Control the maximum chunk size (in MB) for the RPC message queue.
853
+ # Objects larger than this threshold will be broadcast to worker
854
+ # processes via zmq.
855
+ "VLLM_MQ_MAX_CHUNK_BYTES_MB" :
856
+ lambda : int (os .getenv ("VLLM_MQ_MAX_CHUNK_BYTES_MB" , "16" )),
850
857
}
851
858
852
859
# --8<-- [end:env-vars-definition]
Original file line number Diff line number Diff line change 20
20
21
21
import cloudpickle
22
22
23
+ import vllm .envs as envs
23
24
from vllm .config import VllmConfig
24
25
from vllm .distributed import (destroy_distributed_environment ,
25
26
destroy_model_parallel )
@@ -72,7 +73,10 @@ def _init_executor(self) -> None:
72
73
73
74
# Initialize worker and set up message queues for SchedulerOutputs
74
75
# and ModelRunnerOutputs
75
- self .rpc_broadcast_mq = MessageQueue (self .world_size , self .world_size )
76
+ max_chunk_bytes = envs .VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
77
+ self .rpc_broadcast_mq = MessageQueue (self .world_size ,
78
+ self .world_size ,
79
+ max_chunk_bytes = max_chunk_bytes )
76
80
scheduler_output_handle = self .rpc_broadcast_mq .export_handle ()
77
81
78
82
# Create workers
You can’t perform that action at this time.
0 commit comments