1010import importlib .util
1111import inspect
1212import ipaddress
13+ import multiprocessing
1314import os
1415import re
1516import resource
2021import tempfile
2122import threading
2223import time
24+ import traceback
2325import uuid
2426import warnings
2527import weakref
2931from dataclasses import dataclass , field
3032from functools import lru_cache , partial , wraps
3133from typing import (TYPE_CHECKING , Any , AsyncGenerator , Awaitable , Callable ,
32- Dict , Generator , Generic , List , Literal , NamedTuple ,
33- Optional , Tuple , Type , TypeVar , Union , overload )
34+ Dict , Generator , Generic , Iterator , List , Literal ,
35+ NamedTuple , Optional , Tuple , Type , TypeVar , Union ,
36+ overload )
3437from uuid import uuid4
3538
3639import numpy as np
3942import torch
4043import torch .types
4144import yaml
45+ import zmq
46+ import zmq .asyncio
4247from packaging .version import Version
4348from torch .library import Library
4449from typing_extensions import ParamSpec , TypeIs , assert_never
@@ -1844,7 +1849,7 @@ def memory_profiling(
18441849 result .non_kv_cache_memory_in_bytes = result .non_torch_increase_in_bytes + result .torch_peak_increase_in_bytes + result .weights_memory_in_bytes # noqa
18451850
18461851
1847- # Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840 /python/sglang/srt/utils.py#L630 # noqa: E501Curre
1852+ # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1 /python/sglang/srt/utils.py#L630 # noqa: E501
18481853def set_ulimit (target_soft_limit = 65535 ):
18491854 resource_type = resource .RLIMIT_NOFILE
18501855 current_soft , current_hard = resource .getrlimit (resource_type )
@@ -1859,3 +1864,82 @@ def set_ulimit(target_soft_limit=65535):
18591864 "with error %s. This can cause fd limit errors like"
18601865 "`OSError: [Errno 24] Too many open files`. Consider "
18611866 "increasing with ulimit -n" , current_soft , e )
1867+
1868+
1869+ # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501
def get_exception_traceback():
    """Return the formatted traceback of the exception being handled.

    Intended to be called from inside an ``except`` block; the result is
    the same text the interpreter would print for the active exception,
    returned as a single string.
    """
    # format_exc() formats sys.exc_info() and joins the resulting lines.
    return traceback.format_exc()
1874+
1875+
1876+ # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501
def make_zmq_socket(
    ctx: Union[zmq.asyncio.Context, zmq.Context],  # type: ignore[name-defined]
    path: str,
    type: Any,
) -> Union[zmq.Socket, zmq.asyncio.Socket]:  # type: ignore[name-defined]
    """Make a ZMQ socket with the proper bind/connect semantics.

    PULL sockets connect to ``path``; PUSH sockets bind to it. Both have
    their high-water mark set to 0 and their kernel buffer sized from the
    memory currently reported by ``psutil``.

    Args:
        ctx: The (sync or asyncio) ZMQ context that creates the socket.
        path: Endpoint address to connect or bind to.
        type: ZMQ socket type; only PULL and PUSH are supported.

    Returns:
        The configured, already connected/bound socket.

    Raises:
        ValueError: If ``type`` is neither PULL nor PUSH. The socket that
            was created for the unsupported type is closed first.
    """

    mem = psutil.virtual_memory()
    socket = ctx.socket(type)

    # Calculate buffer size based on system memory (values in GiB).
    total_mem = mem.total / 1024**3
    available_mem = mem.available / 1024**3
    # For systems with substantial memory (>32GB total, >16GB available):
    # - Set a large 0.5GB buffer to improve throughput
    # For systems with less memory:
    # - Use system default (-1) to avoid excessive memory consumption
    if total_mem > 32 and available_mem > 16:
        buf_size = int(0.5 * 1024**3)  # 0.5GB in bytes
    else:
        buf_size = -1  # Use system default buffer size

    if type == zmq.constants.PULL:
        socket.setsockopt(zmq.constants.RCVHWM, 0)
        socket.setsockopt(zmq.constants.RCVBUF, buf_size)
        socket.connect(path)
    elif type == zmq.constants.PUSH:
        socket.setsockopt(zmq.constants.SNDHWM, 0)
        socket.setsockopt(zmq.constants.SNDBUF, buf_size)
        socket.bind(path)
    else:
        # The socket already exists on the caller's context; close it so an
        # unsupported type does not leave an orphaned socket behind.
        socket.close(linger=0)
        raise ValueError(f"Unknown Socket Type: { type } ")

    return socket
1911+
1912+
@contextlib.contextmanager
def zmq_socket_ctx(
        path: str,
        type: Any) -> Iterator[zmq.Socket]:  # type: ignore[name-defined]
    """Context manager for a ZMQ socket

    Creates a dedicated ZMQ context with two I/O threads, yields a socket
    built by ``make_zmq_socket`` for ``path``/``type``, and destroys the
    context (with zero linger) on exit. A ``KeyboardInterrupt`` raised
    while the socket is being created or used is logged at debug level
    and not propagated.
    """

    zmq_context = zmq.Context(io_threads=2)  # type: ignore[attr-defined]
    try:
        sock = make_zmq_socket(zmq_context, path, type)
        yield sock

    except KeyboardInterrupt:
        logger.debug("Got Keyboard Interrupt.")

    finally:
        # Always tear down the context, whether or not creation succeeded.
        zmq_context.destroy(linger=0)
1928+
1929+
def _check_multiproc_method():
    """Force the `spawn` start method when CUDA is already initialized.

    If ``cuda_is_initialized()`` is true and the
    ``VLLM_WORKER_MULTIPROC_METHOD`` environment variable is not already
    ``"spawn"``, log a warning and set the variable to ``"spawn"``.
    Otherwise do nothing.
    """
    if not cuda_is_initialized():
        return
    if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") == "spawn":
        return

    logger.warning("CUDA was previously initialized. We must use "
                   "the `spawn` multiprocessing start method. Setting "
                   "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
                   "See https://docs.vllm.ai/en/latest/getting_started/"
                   "debugging.html#python-multiprocessing "
                   "for more information.")
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
1940+
1941+
def get_mp_context():
    """Return the multiprocessing context for the configured start method.

    Runs ``_check_multiproc_method()`` first, then reads
    ``envs.VLLM_WORKER_MULTIPROC_METHOD`` and passes it to
    ``multiprocessing.get_context``.
    """
    # The check must run before the env value is read: it may rewrite it.
    _check_multiproc_method()
    return multiprocessing.get_context(envs.VLLM_WORKER_MULTIPROC_METHOD)
0 commit comments