mlc-ai · tqchen · Apr 23, 2024 · Apr 21, 2024
diff --git a/python/mlc_llm/protocol/openai_api_protocol.py b/python/mlc_llm/protocol/openai_api_protocol.py
@@ -84,7 +84,7 @@ class CompletionRequest(BaseModel):
     API reference: https://platform.openai.com/docs/api-reference/completions/create
     """
 
-    model: str
+    model: Optional[str] = None
     prompt: Union[str, List[int]]
     best_of: int = 1
     echo: bool = False
@@ -154,7 +154,7 @@ class CompletionResponse(BaseModel):
     id: str
     choices: List[CompletionResponseChoice]
     created: int = Field(default_factory=lambda: int(time.time()))
-    model: str
+    model: Optional[str] = None
     object: str = "text_completion"
     usage: UsageInfo = Field(
         default_factory=lambda: UsageInfo()  # pylint: disable=unnecessary-lambda
@@ -200,7 +200,7 @@ class ChatCompletionRequest(BaseModel):
     """
 
     messages: List[ChatCompletionMessage]
-    model: str
+    model: Optional[str] = None
     frequency_penalty: Optional[float] = None
     presence_penalty: Optional[float] = None
     logprobs: bool = False
@@ -343,7 +343,7 @@ class ChatCompletionResponse(BaseModel):
     id: str
     choices: List[ChatCompletionResponseChoice]
     created: int = Field(default_factory=lambda: int(time.time()))
-    model: str
+    model: Optional[str] = None
     system_fingerprint: str
     object: Literal["chat.completion"] = "chat.completion"
     usage: UsageInfo = Field(
@@ -359,7 +359,7 @@ class ChatCompletionStreamResponse(BaseModel):
     id: str
     choices: List[ChatCompletionStreamResponseChoice]
     created: int = Field(default_factory=lambda: int(time.time()))
-    model: str
+    model: Optional[str] = None
     system_fingerprint: str
     object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
     usage: UsageInfo = Field(

diff --git a/python/mlc_llm/serve/engine.py b/python/mlc_llm/serve/engine.py
@@ -61,8 +61,8 @@ async def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
         messages: List[Dict[str, Any]],
-        model: str,
         stream: Literal[True],
+        model: Optional[str] = None,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         logprobs: bool = False,
@@ -111,7 +111,7 @@ async def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
         messages: List[Dict[str, Any]],
-        model: str,
+        model: Optional[str] = None,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         logprobs: bool = False,
@@ -160,7 +160,7 @@ async def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
         messages: List[Dict[str, Any]],
-        model: str,
+        model: Optional[str] = None,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         logprobs: bool = False,
@@ -238,8 +238,8 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
         messages: List[Dict[str, Any]],
-        model: str,
         stream: Literal[True],
+        model: Optional[str] = None,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         logprobs: bool = False,
@@ -288,7 +288,7 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
         messages: List[Dict[str, Any]],
-        model: str,
+        model: Optional[str] = None,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         logprobs: bool = False,
@@ -335,7 +335,7 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
         messages: List[Dict[str, Any]],
-        model: str,
+        model: Optional[str] = None,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         logprobs: bool = False,
@@ -412,9 +412,9 @@ def __init__(self, engine: weakref.ReferenceType) -> None:
     async def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
-        model: str,
         prompt: Union[str, List[int]],
         stream: Literal[True],
+        model: Optional[str] = None,
         best_of: int = 1,
         echo: bool = False,
         frequency_penalty: float = 0.0,
@@ -463,8 +463,8 @@ async def create(  # pylint: disable=too-many-arguments,too-many-locals
     async def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
-        model: str,
         prompt: Union[str, List[int]],
+        model: Optional[str] = None,
         best_of: int = 1,
         echo: bool = False,
         frequency_penalty: float = 0.0,
@@ -511,8 +511,8 @@ async def create(  # pylint: disable=too-many-arguments,too-many-locals
     async def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
-        model: str,
         prompt: Union[str, List[int]],
+        model: Optional[str] = None,
         best_of: int = 1,
         echo: bool = False,
         frequency_penalty: float = 0.0,
@@ -591,9 +591,9 @@ def __init__(self, engine: weakref.ReferenceType) -> None:
     def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
-        model: str,
         prompt: Union[str, List[int]],
         stream: Literal[True],
+        model: Optional[str] = None,
         best_of: int = 1,
         echo: bool = False,
         frequency_penalty: float = 0.0,
@@ -642,8 +642,8 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
     def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
-        model: str,
         prompt: Union[str, List[int]],
+        model: Optional[str] = None,
         best_of: int = 1,
         echo: bool = False,
         frequency_penalty: float = 0.0,
@@ -690,8 +690,8 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
     def create(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
-        model: str,
         prompt: Union[str, List[int]],
+        model: Optional[str] = None,
         best_of: int = 1,
         echo: bool = False,
         frequency_penalty: float = 0.0,
@@ -883,7 +883,7 @@ async def _chat_completion(  # pylint: disable=too-many-arguments,too-many-local
         self,
         *,
         messages: List[Dict[str, Any]],
-        model: str,
+        model: Optional[str] = None,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         logprobs: bool = False,
@@ -1003,8 +1003,8 @@ async def _chat_completion(  # pylint: disable=too-many-arguments,too-many-local
     async def _completion(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
-        model: str,
         prompt: Union[str, List[int]],
+        model: Optional[str] = None,
         best_of: int = 1,
         echo: bool = False,
         frequency_penalty: float = 0.0,
@@ -1429,7 +1429,7 @@ def _chat_completion(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
         messages: List[Dict[str, Any]],
-        model: str,
+        model: Optional[str] = None,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         logprobs: bool = False,
@@ -1549,8 +1549,8 @@ def _chat_completion(  # pylint: disable=too-many-arguments,too-many-locals
     def _completion(  # pylint: disable=too-many-arguments,too-many-locals
         self,
         *,
-        model: str,
         prompt: Union[str, List[int]],
+        model: Optional[str] = None,
         best_of: int = 1,
         echo: bool = False,
         frequency_penalty: float = 0.0,

diff --git a/python/mlc_llm/serve/server/server_context.py b/python/mlc_llm/serve/server/server_context.py
@@ -37,8 +37,11 @@ def add_model(self, hosted_model: str, engine: AsyncLLMEngine) -> None:
             raise RuntimeError(f"Model {hosted_model} already running.")
         self._models[hosted_model] = engine
 
-    def get_engine(self, model: str) -> Optional[AsyncLLMEngine]:
-        """Get the async engine of the requested model."""
+    def get_engine(self, model: Optional[str]) -> Optional[AsyncLLMEngine]:
+        """Return the sole served engine, ignoring `model`, when exactly one is served;
+        otherwise the engine registered under `model`, or None if there is none."""
+        if len(self._models) == 1:
+            return next(iter(self._models.values()))
         return self._models.get(model, None)
 
     def get_model_list(self) -> List[str]:

diff --git a/tests/python/serve/server/test_server.py b/tests/python/serve/server/test_server.py
@@ -329,23 +329,6 @@ def test_openai_v1_completions_openai_package(
         )
 
 
-def test_openai_v1_completions_invalid_requested_model(
-    launch_server,  # pylint: disable=unused-argument
-):
-    # `launch_server` is a pytest fixture defined in conftest.py.
-
-    model = "unserved_model"
-    payload = {
-        "model": model,
-        "prompt": "What is the meaning of life?",
-        "max_tokens": 10,
-    }
-    response = requests.post(OPENAI_V1_COMPLETION_URL, json=payload, timeout=180)
-    expect_error(
-        response_str=response.json(), msg_prefix=f'The requested model "{model}" is not served.'
-    )
-
-
 @pytest.mark.parametrize("stream", [False, True])
 def test_openai_v1_completions_echo(
     served_model: Tuple[str, str],
@@ -1319,7 +1302,6 @@ def test_debug_dump_event_trace(
     test_openai_v1_completions(MODEL, None, stream=True)
     test_openai_v1_completions_openai_package(MODEL, None, stream=False)
     test_openai_v1_completions_openai_package(MODEL, None, stream=True)
-    test_openai_v1_completions_invalid_requested_model(None)
     test_openai_v1_completions_echo(MODEL, None, stream=False)
     test_openai_v1_completions_echo(MODEL, None, stream=True)
     test_openai_v1_completions_suffix(MODEL, None, stream=False)