Commit c163473

fix: Lazy tokenizer init in StructuredOutputManager to prevent semaphore leak
GGUF models without precomputed merges trigger `build_merges_on_the_fly` in the transformers library, which uses multiprocessing primitives. When this happens in both the APIServer process (for request validation) and the EngineCore subprocess (via StructuredOutputManager), the subprocess leaks a semaphore, causing the server to hang indefinitely.

This change makes tokenizer initialization lazy in StructuredOutputManager:

- The tokenizer is only loaded when grammar_init() is first called
- Most inference requests don't use structured output, so the tokenizer in EngineCore is never loaded
- For requests that do use structured output, the tokenizer is loaded on demand

The fix resolves the following symptoms:

- Server hangs after "resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown"
- Tokenizer merges being built twice (once in APIServer, once in EngineCore)
- GGUF models failing to start even though weights load successfully

Tested with bartowski/Phi-3.5-mini-instruct-GGUF (Q5_K_M).
1 parent 541a2ef commit c163473
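
The fix boils down to a standard lazy-initialization property: defer the expensive construction until first access, so a process that never needs the object never pays for it. A minimal, standalone sketch of the pattern follows; nothing in it is vLLM API, and `load_tokenizer` is a hypothetical stand-in for an expensive load such as building GGUF merges on the fly.

def load_tokenizer():
    """Hypothetical stand-in for an expensive tokenizer load,
    e.g. one that builds GGUF merges on the fly."""
    print("loading tokenizer...")
    return object()


class Manager:
    def __init__(self) -> None:
        # Nothing expensive happens here, so a process that never
        # touches the tokenizer never loads it (and never touches
        # any multiprocessing primitives the load might use).
        self._tokenizer = None
        self._tokenizer_initialized = False

    @property
    def tokenizer(self):
        # First access performs the one-time initialization.
        if not self._tokenizer_initialized:
            self._tokenizer = load_tokenizer()
            self._tokenizer_initialized = True
        return self._tokenizer


m = Manager()    # cheap: no tokenizer loaded yet
_ = m.tokenizer  # prints "loading tokenizer..." exactly once
_ = m.tokenizer  # cached: no second load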

1 file changed: +54 −28 lines changed
vllm/v1/structured_output/__init__.py

@@ -63,39 +63,62 @@ def __init__(self, vllm_config: VllmConfig):
         max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
         self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)

-        if not self.vllm_config.model_config.skip_tokenizer_init:
-            # The default max_workers if not specified is the number of
-            # CPUs * 5, which is way too high since these tasks are CPU-bound,
-            # not I/O bound. We also know we would never dominate CPU usage
-            # with just grammar compilation, so we set it to half the number
-            # of CPUs.
-            max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
-            self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = init_tokenizer_from_config(
-                model_config=self.vllm_config.model_config
-            )
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            reasoning_parser_plugin = (
-                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
-            )
-            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
-                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
-
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            if reasoning_parser:
-                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+        # Tokenizer is loaded lazily to avoid duplicate tokenizer initialization
+        # in multiprocess mode. For GGUF models, this prevents a semaphore leak
+        # that causes server hangs (tokenizer builds merges on the fly, which
+        # uses multiprocessing primitives that don't clean up in subprocesses).
+        self._tokenizer = None
+        self._tokenizer_initialized = False
+        self.executor = None

         self.enable_in_reasoning = (
             self.vllm_config.structured_outputs_config.enable_in_reasoning
         )

+    @property
+    def tokenizer(self):
+        """Lazily initialize tokenizer when first accessed."""
+        if not self._tokenizer_initialized:
+            self._init_tokenizer()
+        return self._tokenizer
+
+    def _init_tokenizer(self):
+        """Initialize tokenizer and related components on first use."""
+        if self._tokenizer_initialized:
+            return
+
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            self._tokenizer_initialized = True
+            return
+
+        # The default max_workers if not specified is the number of
+        # CPUs * 5, which is way too high since these tasks are CPU-bound,
+        # not I/O bound. We also know we would never dominate CPU usage
+        # with just grammar compilation, so we set it to half the number
+        # of CPUs.
+        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tokenizer = init_tokenizer_from_config(
+            model_config=self.vllm_config.model_config
+        )
+
+        reasoning_parser = (
+            self.vllm_config.structured_outputs_config.reasoning_parser
+        )
+        reasoning_parser_plugin = (
+            self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+        )
+        if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
+            ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
+
+        if reasoning_parser:
+            reasoner_cls = ReasoningParserManager.get_reasoning_parser(
+                reasoning_parser
+            )
+            self.reasoner = reasoner_cls(tokenizer=self._tokenizer)
+
+        self._tokenizer_initialized = True
+
     def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
             return
@@ -149,6 +172,9 @@ def grammar_init(self, request: Request) -> None:
            raise ValueError(f"Unsupported structured output backend: {backend}")

         if self._use_async_grammar_compilation:
+            # Ensure tokenizer (and executor) is initialized
+            _ = self.tokenizer
+            assert self.executor is not None, "Executor should be initialized with tokenizer"
             grammar = self.executor.submit(self._create_grammar, request)
         else:
             grammar = self._create_grammar(request)  # type: ignore[assignment]
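
For reference, the resource_tracker warning quoted in the commit message is generic Python multiprocessing behavior, not anything vLLM-specific. On POSIX systems the symptom can be reproduced in isolation with a few lines; the sketch below only demonstrates the mechanism (a semaphore left uncleaned at process death), not the actual transformers/vLLM interaction.

import multiprocessing as mp
import os

if __name__ == "__main__":
    # Registers a POSIX named semaphore with Python's resource tracker.
    sem = mp.Semaphore()
    # Die without running cleanup handlers, as a hung or killed
    # subprocess would. The resource tracker (a separate helper
    # process) then reports at shutdown:
    #   resource_tracker: There appear to be 1 leaked semaphore
    #   objects to clean up at shutdown
    os._exit(0)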
