Commit c163473

fix: Lazy tokenizer init in StructuredOutputManager to prevent semaphore leak
GGUF models without precomputed merges trigger `build_merges_on_the_fly` in the transformers library, which uses multiprocessing primitives. When this happens in both the APIServer process (for request validation) and the EngineCore subprocess (via StructuredOutputManager), the subprocess leaks a semaphore, causing the server to hang indefinitely.

This change makes tokenizer initialization lazy in StructuredOutputManager:

- The tokenizer is only loaded when grammar_init() is first called
- Most inference requests don't use structured output, so the tokenizer in EngineCore is never loaded
- For requests that do use structured output, the tokenizer is loaded on demand

The fix resolves the following symptoms:

- Server hangs after "resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown"
- Tokenizer merges being built twice (once in APIServer, once in EngineCore)
- GGUF models failing to start even though weights load successfully

Tested with bartowski/Phi-3.5-mini-instruct-GGUF (Q5_K_M).
1 parent 541a2ef commit c163473
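
The fix boils down to a standard lazy-initialization property: defer the expensive construction until first access, so a process that never needs the object never pays for it. A minimal, standalone sketch of the pattern follows; nothing in it is vLLM API, and `load_tokenizer` is a hypothetical stand-in for an expensive load such as building GGUF merges on the fly.

def load_tokenizer():
    """Hypothetical stand-in for an expensive tokenizer load,
    e.g. one that builds GGUF merges on the fly."""
    print("loading tokenizer...")
    return object()


class Manager:
    def __init__(self) -> None:
        # Nothing expensive happens here, so a process that never
        # touches the tokenizer never loads it (and never touches
        # any multiprocessing primitives the load might use).
        self._tokenizer = None
        self._tokenizer_initialized = False

    @property
    def tokenizer(self):
        # First access performs the one-time initialization.
        if not self._tokenizer_initialized:
            self._tokenizer = load_tokenizer()
            self._tokenizer_initialized = True
        return self._tokenizer


m = Manager()    # cheap: no tokenizer loaded yet
_ = m.tokenizer  # prints "loading tokenizer..." exactly once
_ = m.tokenizer  # cached: no second load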

1 file changed: +54 −28 lines changed
vllm/v1/structured_output/__init__.py

@@ -63,39 +63,62 @@ def __init__(self, vllm_config: VllmConfig):
         max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
         self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)

-        if not self.vllm_config.model_config.skip_tokenizer_init:
-            # The default max_workers if not specified is the number of
-            # CPUs * 5, which is way too high since these tasks are CPU-bound,
-            # not I/O bound. We also know we would never dominate CPU usage
-            # with just grammar compilation, so we set it to half the number
-            # of CPUs.
-            max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
-            self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = init_tokenizer_from_config(
-                model_config=self.vllm_config.model_config
-            )
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            reasoning_parser_plugin = (
-                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
-            )
-            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
-                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
-
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            if reasoning_parser:
-                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+        # Tokenizer is loaded lazily to avoid duplicate tokenizer initialization
+        # in multiprocess mode. For GGUF models, this prevents a semaphore leak
+        # that causes server hangs (tokenizer builds merges on the fly, which
+        # uses multiprocessing primitives that don't clean up in subprocesses).
+        self._tokenizer = None
+        self._tokenizer_initialized = False
+        self.executor = None

         self.enable_in_reasoning = (
             self.vllm_config.structured_outputs_config.enable_in_reasoning
         )

+    @property
+    def tokenizer(self):
+        """Lazily initialize tokenizer when first accessed."""
+        if not self._tokenizer_initialized:
+            self._init_tokenizer()
+        return self._tokenizer
+
+    def _init_tokenizer(self):
+        """Initialize tokenizer and related components on first use."""
+        if self._tokenizer_initialized:
+            return
+
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            self._tokenizer_initialized = True
+            return
+
+        # The default max_workers if not specified is the number of
+        # CPUs * 5, which is way too high since these tasks are CPU-bound,
+        # not I/O bound. We also know we would never dominate CPU usage
+        # with just grammar compilation, so we set it to half the number
+        # of CPUs.
+        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tokenizer = init_tokenizer_from_config(
+            model_config=self.vllm_config.model_config
+        )
+
+        reasoning_parser = (
+            self.vllm_config.structured_outputs_config.reasoning_parser
+        )
+        reasoning_parser_plugin = (
+            self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+        )
+        if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
+            ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
+
+        if reasoning_parser:
+            reasoner_cls = ReasoningParserManager.get_reasoning_parser(
+                reasoning_parser
+            )
+            self.reasoner = reasoner_cls(tokenizer=self._tokenizer)
+
+        self._tokenizer_initialized = True
+
     def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
             return
@@ -149,6 +172,9 @@ def grammar_init(self, request: Request) -> None:
            raise ValueError(f"Unsupported structured output backend: {backend}")

         if self._use_async_grammar_compilation:
+            # Ensure tokenizer (and executor) is initialized
+            _ = self.tokenizer
+            assert self.executor is not None, "Executor should be initialized with tokenizer"
             grammar = self.executor.submit(self._create_grammar, request)
         else:
             grammar = self._create_grammar(request)  # type: ignore[assignment]
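
For reference, the resource_tracker warning quoted in the commit message is generic Python multiprocessing behavior, not anything vLLM-specific. On POSIX systems the symptom can be reproduced in isolation with a few lines; the sketch below only demonstrates the mechanism (a semaphore left uncleaned at process death), not the actual transformers/vLLM interaction.

import multiprocessing as mp
import os

if __name__ == "__main__":
    # Registers a POSIX named semaphore with Python's resource tracker.
    sem = mp.Semaphore()
    # Die without running cleanup handlers, as a hung or killed
    # subprocess would. The resource tracker (a separate helper
    # process) then reports at shutdown:
    #   resource_tracker: There appear to be 1 leaked semaphore
    #   objects to clean up at shutdown
    os._exit(0)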
