Merged
43 changes: 38 additions & 5 deletions ai_server/server.py
@@ -55,16 +55,23 @@ def chat_with_llama_server_http(
timeout: int = 300,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
model_options: Optional[dict] = None,
) -> str:
"""Handle chat using llama-server HTTP API."""
if not LLAMA_SERVER_URL:
raise Exception("LLAMA_SERVER_URL environment variable not set")

try:
messages = _build_messages(content, system_prompt, image_files=[]) # TODO: Pass image files
payload = {'model': model, 'messages': messages, 'stream': False, 'max_tokens': 512}

if not model_options:
model_options = {}

payload = {'model': model, 'messages': messages, **model_options}
Contributor:

Please keep the stream option hard coded. We can allow the client to override the default value, but we should set a False default.

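A minimal sketch of the requested behaviour, reusing the names from this diff (dict.setdefault is simply an equivalent spelling of the 'stream' check added below):

payload = {'model': model, 'messages': messages, **(model_options or {})}
payload.setdefault('stream', False)  # hard-coded default; a client may still override it via model_options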
if json_schema:
payload['json_schema'] = json_schema[SCHEMA_KEY]
if 'stream' not in payload:
payload['stream'] = False

response = requests.post(
f'{LLAMA_SERVER_URL}/v1/chat/completions',
@@ -106,6 +113,7 @@ def chat_with_ollama(
system_prompt: Optional[str] = None,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
model_options: Optional[dict] = None,
) -> str:
"""Handle chat using ollama."""
messages = _build_messages(content, system_prompt, image_files)
@@ -115,6 +123,7 @@
messages=messages,
stream=False,
format=json_schema[SCHEMA_KEY] if json_schema else None,
options=model_options,
)
return response.message.content

@@ -125,6 +134,7 @@ def chat_with_llamacpp(
system_prompt: Optional[str] = None,
timeout: int = 300,
image_files: Optional[list] = None,
model_options: Optional[dict] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Handle chat using llama.cpp CLI."""
@@ -142,6 +152,10 @@
if system_prompt:
cmd.extend(['--system-prompt', system_prompt])

if model_options:
for key, value in model_options.items():
cmd.extend(['--model-option', key, str(value)])  # subprocess arguments must be strings, so cast option values

if image_files:
pass # TODO: pass image files

@@ -171,6 +185,7 @@ def chat_with_model(
llama_mode: str = "cli",
system_prompt: Optional[str] = None,
image_files: Optional[list] = None,
model_options: Optional[dict] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Route chat request based on llama_mode: server (external), cli, or ollama fallback; and with optional system prompt."""
@@ -179,18 +194,33 @@
if not LLAMA_SERVER_URL:
raise Exception("LLAMA_SERVER_URL environment variable not set for server mode")
return chat_with_llama_server_http(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
model,
content,
system_prompt=system_prompt,
image_files=image_files,
json_schema=json_schema,
model_options=model_options,
)
elif llama_mode == "cli":
return chat_with_llamacpp(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
model,
content,
system_prompt=system_prompt,
image_files=image_files,
json_schema=json_schema,
model_options=model_options,
)
else:
raise ValueError(f"Invalid llama_mode: '{llama_mode}'. Valid options are 'server' or 'cli'.")
else:
# Model not available in llama.cpp, use ollama
return chat_with_ollama(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
model,
content,
system_prompt=system_prompt,
image_files=image_files,
json_schema=json_schema,
model_options=model_options,
)


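For reference, a caller inside the codebase could now thread sampling options through the router roughly like this (a sketch only; the model name is hypothetical, the import path follows the tests' use of ai_server.server):

from ai_server.server import chat_with_model

reply = chat_with_model(
    'deepseek-v3',  # hypothetical model name
    'Explain this function',
    llama_mode='server',
    model_options={'temperature': 0.2, 'top_p': 0.9},
)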
@@ -216,14 +246,17 @@ def chat():
llama_mode = request.form.get('llama_mode', 'cli')
system_prompt = request.form.get('system_prompt')
image_files = list(request.files.values())
model_options = request.form.get('model_options')
if model_options:
model_options = json.loads(model_options)  # parse the JSON-encoded form field, mirroring json_schema below
json_schema = request.form.get('json_schema')
if json_schema:
json_schema = json.loads(json_schema)

if not content.strip():
abort(400, description='Missing prompt content')

response_content = chat_with_model(model, content, llama_mode, system_prompt, image_files, json_schema)
response_content = chat_with_model(
model, content, llama_mode, system_prompt, image_files, model_options=model_options, json_schema=json_schema
)
return jsonify(response_content)


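An external client would send model_options as a JSON-encoded form field. The sketch below is hypothetical: the route path, port, 'model' and 'content' field names, and the absence of authentication are assumptions, not shown in this diff.

import json
import requests

response = requests.post(
    'http://localhost:5000/chat',  # assumed route and port
    data={
        'model': 'deepseek-v3',                # hypothetical model name
        'content': 'Summarize this log file',  # assumed prompt field name
        'llama_mode': 'server',
        'model_options': json.dumps({'temperature': 0.2, 'top_p': 0.9}),
    },
    timeout=300,
)
print(response.json())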
46 changes: 42 additions & 4 deletions test/test_cli_mode.py
@@ -109,7 +109,12 @@ def test_cli_mode_uses_llamacpp_when_available(self):
assert result == "CLI response from DeepSeek V3"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Write a function', system_prompt=None, image_files=None, json_schema=None
TEST_LLAMACPP_MODEL,
'Write a function',
system_prompt=None,
image_files=None,
json_schema=None,
model_options=None,
)

def test_cli_mode_fallback_to_ollama_when_unavailable(self):
@@ -122,7 +127,12 @@ def test_cli_mode_fallback_to_ollama_when_unavailable(self):
assert result == "Ollama response from DeepSeek Coder"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, 'Help with coding', system_prompt=None, image_files=None, json_schema=None
TEST_OLLAMA_MODEL,
'Help with coding',
system_prompt=None,
image_files=None,
json_schema=None,
model_options=None,
)

def test_default_mode_is_cli(self):
@@ -135,7 +145,30 @@ def test_default_mode_is_cli(self):
assert result == "Default CLI mode response"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Help me', system_prompt=None, image_files=None, json_schema=None
TEST_LLAMACPP_MODEL, 'Help me', system_prompt=None, image_files=None, json_schema=None, model_options=None
)

def test_model_options(self):
"""Test that model_options are passed correctly in CLI mode."""
self.mock_available.return_value = True
self.mock_chat_llamacpp.return_value = "Model options test response"

test_options = {"temperature": 0.7, "top_p": 0.9}

result = chat_with_model(TEST_LLAMACPP_MODEL, 'Help me', model_options=test_options)

assert result == "Model options test response"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL,
'Help me',
system_prompt=None,
image_files=None,
json_schema=None,
model_options=test_options,
)


@@ -192,5 +225,10 @@ def test_cli_mode_passes_json_schema_to_ollama(self, tmp_path):
assert result == "schema-aware response"

mock_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, "Give me an answer", system_prompt=None, image_files=None, json_schema=test_schema
TEST_OLLAMA_MODEL,
"Give me an answer",
system_prompt=None,
image_files=None,
json_schema=test_schema,
model_options=None,
)
8 changes: 6 additions & 2 deletions test/test_core.py
@@ -83,6 +83,7 @@ def test_chat_with_ollama_success(self, mock_ollama):
model=TEST_OLLAMA_MODEL,
messages=[{'role': 'user', 'content': 'Help me write a Python function'}],
stream=False,
options=None,
format=None,
)

@@ -110,12 +111,15 @@ def test_chat_with_ollama_with_json_schema(self, mock_ollama, tmp_path):
mock_response.message.content = "42"
mock_ollama.return_value = mock_response

result = chat_with_ollama(TEST_OLLAMA_MODEL, "What is the meaning of life?", json_schema=test_schema)
result = chat_with_ollama(
TEST_OLLAMA_MODEL, "What is the meaning of life?", json_schema=test_schema, model_options=None
)

assert result == "42"
mock_ollama.assert_called_once_with(
model=TEST_OLLAMA_MODEL,
messages=[{"role": "user", "content": "What is the meaning of life?"}],
stream=False,
format={"type": "object", "properties": {"answer": {"type": "string"}}},
format={'type': 'object', 'properties': {'answer': {'type': 'string'}}},
options=None,
)
32 changes: 30 additions & 2 deletions test/test_server_mode.py
@@ -112,7 +112,12 @@ def test_server_mode_uses_llamacpp_when_available(self):
assert result == "Server response from DeepSeek V3"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_server.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Explain code', system_prompt=None, image_files=None, json_schema=None
TEST_LLAMACPP_MODEL,
'Explain code',
system_prompt=None,
image_files=None,
json_schema=None,
model_options=None,
)

def test_server_mode_fallback_to_ollama_when_unavailable(self):
@@ -125,7 +130,30 @@ def test_server_mode_fallback_to_ollama_when_unavailable(self):
assert result == "Ollama fallback response"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, 'Debug code', system_prompt=None, image_files=None, json_schema=None
TEST_OLLAMA_MODEL, 'Debug code', system_prompt=None, image_files=None, json_schema=None, model_options=None
)

def test_server_mode_fallback_to_ollama_with_model_options(self):
"""Test server mode falls back to ollama and passes model_options correctly when model is unavailable in llama.cpp."""
self.mock_available.return_value = False
self.mock_chat_ollama.return_value = "Ollama fallback response"

test_options = {"temperature": 0.3, "top_k": 50}

result = chat_with_model(TEST_OLLAMA_MODEL, 'Debug code', llama_mode='server', model_options=test_options)

assert result == "Ollama fallback response"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL,
'Debug code',
system_prompt=None,
image_files=None,
model_options=test_options,
json_schema=None,
)

def test_server_mode_requires_server_url(self):
7 changes: 6 additions & 1 deletion test/test_system_prompt.py
@@ -78,5 +78,10 @@ def test_chat_with_model_routing(self, mock_available, mock_chat):

chat_with_model(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT)
mock_chat.assert_called_once_with(
TEST_MODEL, TEST_USER_CONTENT, system_prompt=TEST_SYSTEM_PROMPT, image_files=None, json_schema=None
TEST_MODEL,
TEST_USER_CONTENT,
system_prompt=TEST_SYSTEM_PROMPT,
image_files=None,
json_schema=None,
model_options=None,
)
9 changes: 7 additions & 2 deletions test/test_system_prompt_api.py
@@ -1,3 +1,4 @@
import json
from unittest.mock import patch

import pytest
@@ -39,7 +40,9 @@ def test_api_with_system_prompt(self, mock_chat, mock_redis, client):

assert response.status_code == 200

mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT, [], None)
mock_chat.assert_called_once_with(
TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT, [], json_schema=None, model_options=None
)

@patch('ai_server.server.REDIS_CONNECTION')
@patch('ai_server.server.chat_with_model')
@@ -54,7 +57,9 @@ def test_api_without_system_prompt(self, mock_chat, mock_redis, client):

assert response.status_code == 200

mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', None, [], None)
mock_chat.assert_called_once_with(
TEST_MODEL, TEST_USER_CONTENT, 'cli', None, [], model_options=None, json_schema=None
)

@patch('ai_server.server.REDIS_CONNECTION')
def test_api_authentication_still_required(self, mock_redis, client):