
feat: Add multiple tokenizers specification for OpenAI frontend #8027

Open · wants to merge 5 commits into base: main
15 changes: 12 additions & 3 deletions python/openai/openai_frontend/engine/triton_engine.py
@@ -83,13 +83,21 @@ class TritonLLMEngine(LLMEngine):
def __init__(
self,
server: tritonserver.Server,
tokenizer: str,
backend: Optional[str] = None,
lora_separator: Optional[str] = None,
tokenizer_map: Optional[Dict[str, str]] = None,
):
# Assume an already configured and started server
self.server = server
self.tokenizer = self._get_tokenizer(tokenizer)
self.tokenizer_map = {}
if tokenizer_map:
for model_name, tokenizer_path in tokenizer_map.items():
try:
self.tokenizer_map[model_name] = get_tokenizer(tokenizer_path)
except Exception as e:
print(
f"Warning: Failed to load tokenizer for {model_name} from {tokenizer_path}: {e}"
)
# TODO: Reconsider name of "backend" vs. something like "request_format"
self.backend = backend
self.lora_separator = lora_separator
@@ -294,12 +302,13 @@ def _get_model_metadata(self) -> Dict[str, TritonModelMetadata]:
self.server.options.model_repository, name, model.version
)

default_tokenizer = self.tokenizer_map.get("default", None)
metadata = TritonModelMetadata(
name=name,
backend=backend,
model=model,
tokenizer=self.tokenizer,
lora_names=lora_names,
tokenizer=self.tokenizer_map.get(name, default_tokenizer),
Review comment (Contributor):
Since the tokenizer could technically be None now, should we add a check in the chat method so that apply_chat_template is not called when the tokenizer is None, and raise an exception instead? (A sketch follows this file's diff.)

https://github.com/triton-inference-server/server/blob/main/python/openai/openai_frontend/engine/triton_engine.py#L146

create_time=self.create_time,
request_converter=self._determine_request_converter(backend),
)
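To make the suggestion above concrete, a minimal sketch of such a guard is shown below. It is not part of this diff: the helper name and its call site are assumptions, and the "Unknown tokenizer" detail string is borrowed from the new test added later in this PR.

# Sketch only (assumed helper name and call site); avoids calling
# apply_chat_template when a model resolved to no tokenizer.
from fastapi import HTTPException


def _apply_chat_template_or_raise(metadata, messages):
    if metadata.tokenizer is None:
        # Matches the 400 / "Unknown tokenizer" expectation in
        # test_chat_completions_unknown_tokenizers below.
        raise HTTPException(status_code=400, detail="Unknown tokenizer")
    return metadata.tokenizer.apply_chat_template(
        conversation=messages, tokenize=False, add_generation_prompt=True
    )
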
38 changes: 34 additions & 4 deletions python/openai/openai_frontend/main.py
@@ -82,6 +82,27 @@ def start_kserve_frontends(server, args):
return http_service, grpc_service


def parse_tokenizer_arg(tokenizer_args):
if not tokenizer_args:
return {}

tokenizer_map = {}
# Single tokenizer case
if len(tokenizer_args) == 1 and ":" not in tokenizer_args[0]:
tokenizer_map["default"] = tokenizer_args[0]
return tokenizer_map

# Multiple tokenizers case
for arg in tokenizer_args:
try:
model_name, tokenizer_path = arg.split(":")
tokenizer_map[model_name] = tokenizer_path
except ValueError:
print(f"Warning: Skipping invalid tokenizer specification: {arg}")

return tokenizer_map


def parse_args():
parser = argparse.ArgumentParser(
description="Triton Inference Server with OpenAI-Compatible RESTful API server."
@@ -98,8 +119,14 @@ def parse_args():
triton_group.add_argument(
"--tokenizer",
type=str,
nargs="+", # Accept either single value or multiple
default=None,
help="HuggingFace ID or local folder path of the Tokenizer to use for chat templates",
help=(
"HuggingFace ID or local folder path of Tokenizer(s). "
"For single tokenizer: provide path directly. "
"For multiple tokenizers: use format 'model_name:tokenizer_path' for each entry. "
"Example: --tokenizer default:/path/to/tokenizer model1:path1 model2:path2"
),
)
triton_group.add_argument(
"--backend",
@@ -166,8 +193,11 @@ def main():
def main():
args = parse_args()

# Initialize a Triton Inference Server pointing at LLM models
server: tritonserver.Server = tritonserver.Server(
# Parse tokenizer mappings
tokenizer_map = parse_tokenizer_arg(args.tokenizer)

# Initialize Triton server
server = tritonserver.Server(
model_repository=args.model_repository,
log_verbose=args.tritonserver_log_verbose_level,
log_info=True,
@@ -178,9 +208,9 @@
# Wrap Triton Inference Server in an interface-conforming "LLMEngine"
engine: TritonLLMEngine = TritonLLMEngine(
server=server,
tokenizer=args.tokenizer,
backend=args.backend,
lora_separator=args.lora_separator,
tokenizer_map=tokenizer_map,
)

# Attach TritonLLMEngine as the backbone for inference and model management
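For reference, a quick usage sketch of the new CLI parsing follows. It is illustrative only: the import path is assumed from the file location, and the tokenizer IDs/paths echo the examples in the --tokenizer help text above.

# Illustrative usage of parse_tokenizer_arg (import path assumed).
from openai_frontend.main import parse_tokenizer_arg

# Single tokenizer: stored under the "default" key and used for every model.
assert parse_tokenizer_arg(["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]) == {
    "default": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
}

# Multiple tokenizers: one "model_name:tokenizer_path" entry per model.
assert parse_tokenizer_arg(
    ["default:/path/to/tokenizer", "model1:path1", "model2:path2"]
) == {"default": "/path/to/tokenizer", "model1": "path1", "model2": "path2"}

# A malformed entry (no "name:path" separator) is skipped with a warning.
assert parse_tokenizer_arg(["default:/path/to/tokenizer", "bad_entry"]) == {
    "default": "/path/to/tokenizer"
}
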
63 changes: 62 additions & 1 deletion python/openai/tests/test_chat_completions.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -619,3 +619,64 @@ def test_chat_completions_invalid_chat_tokenizer(
assert any(
error in response.json()["detail"].lower() for error in expected_errors
)


class TestMultipleTokenizers:
@pytest.fixture(scope="class")
def model_repository(self):
# Custom model repository for these specific tests
return str(Path(__file__).parent / "vllm_tiny_models")

# Re-use a single Triton server for different frontend configurations
@pytest.fixture(scope="class")
def server(self, model_repository: str):
server = setup_server(model_repository)
yield server
server.stop()

@pytest.fixture(scope="class")
def models(self):
return ["tiny_llama", "phi-4"]

def test_chat_completions_multiple_tokenizers(
self,
server: tritonserver.Server,
models: List[str],
messages: List[dict],
):
app = setup_fastapi_app(
tokenizer={
"tiny_llama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"phi-4": "microsoft/Phi-4-mini-instruct",
},
server=server,
backend="vllm",
)
for model in models:
with TestClient(app) as client:
response = client.post(
"/v1/chat/completions",
json={"model": model, "messages": messages},
)

assert response.status_code == 200
message = response.json()["choices"][0]["message"]
assert message["content"].strip()
assert message["role"] == "assistant"

def test_chat_completions_unknown_tokenizers(
self,
server: tritonserver.Server,
models: List[str],
messages: List[dict],
):
app = setup_fastapi_app(tokenizer="", server=server, backend="vllm")
for model in models:
with TestClient(app) as client:
response = client.post(
"/v1/chat/completions",
json={"model": model, "messages": messages},
)

assert response.status_code == 400
assert response.json()["detail"] == "Unknown tokenizer"
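
Outside the FastAPI TestClient, the same behavior could be exercised end to end with the openai client already imported in tests/utils.py. This is only a sketch: the base URL and port are assumptions about a locally running frontend, and the model names come from the fixture above.

# Hedged end-to-end sketch: assumes the OpenAI-compatible frontend is
# serving the two tiny vLLM models locally (port is an assumption).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

for model in ["tiny_llama", "phi-4"]:
    # Each model's chat request is templated with its own tokenizer,
    # as configured via --tokenizer.
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
    )
    print(model, completion.choices[0].message.content)
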
6 changes: 4 additions & 2 deletions python/openai/tests/utils.py
@@ -29,7 +29,7 @@
import sys
import time
from pathlib import Path
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union

import openai
import requests
@@ -52,7 +52,9 @@ def setup_server(model_repository: str):
return server


def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str):
def setup_fastapi_app(
tokenizer: Union[str, Dict[str, str]], server: tritonserver.Server, backend: str
):
engine: TritonLLMEngine = TritonLLMEngine(
server=server, tokenizer=tokenizer, backend=backend
)
1 change: 1 addition & 0 deletions python/openai/tests/vllm_tiny_models/phi-4/1/model.json
@@ -0,0 +1 @@
{"model": "microsoft/Phi-4-mini-instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.8}
28 changes: 28 additions & 0 deletions python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt
@@ -0,0 +1,28 @@
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

backend: "vllm"
instance_group [{kind: KIND_MODEL}]
1 change: 1 addition & 0 deletions python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json
@@ -0,0 +1 @@
{"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "disable_log_requests": true, "gpu_memory_utilization": 0.2}
28 changes: 28 additions & 0 deletions python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt
@@ -0,0 +1,28 @@
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

backend: "vllm"
instance_group [{kind: KIND_MODEL}]