Merged
43 changes: 36 additions & 7 deletions ai_server/server.py
@@ -2,6 +2,7 @@

import base64
import glob
import json
import os
import subprocess
from typing import Optional
@@ -32,6 +33,7 @@
if _llama_server_url and not _llama_server_url.startswith(('http://', 'https://'))
else _llama_server_url
)
SCHEMA_KEY = "schema"


def _build_messages(content: str, system_prompt: Optional[str] = None, image_files: Optional[list] = None) -> list:
@@ -52,17 +54,21 @@ def chat_with_llama_server_http(
system_prompt: Optional[str] = None,
timeout: int = 300,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Handle chat using llama-server HTTP API."""
if not LLAMA_SERVER_URL:
raise Exception("LLAMA_SERVER_URL environment variable not set")

try:
messages = _build_messages(content, system_prompt, image_files=[]) # TODO: Pass image files
payload = {'model': model, 'messages': messages, 'stream': False, 'max_tokens': 512}
if json_schema:
payload['json_schema'] = json_schema[SCHEMA_KEY]

response = requests.post(
f'{LLAMA_SERVER_URL}/v1/chat/completions',
json={'model': model, 'messages': messages, 'stream': False, 'max_tokens': 512},
json=payload,
headers={'Content-Type': 'application/json'},
timeout=timeout,
)
@@ -95,12 +101,21 @@ def is_llamacpp_available(model: str) -> bool:


def chat_with_ollama(
model: str, content: str, system_prompt: Optional[str] = None, image_files: Optional[list] = None
model: str,
content: str,
system_prompt: Optional[str] = None,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Handle chat using ollama."""
messages = _build_messages(content, system_prompt, image_files)

response = ollama.chat(model=model, messages=messages, stream=False)
response = ollama.chat(
model=model,
messages=messages,
stream=False,
format=json_schema[SCHEMA_KEY] if json_schema else None,
)
return response.message.content


@@ -110,6 +125,7 @@ def chat_with_llamacpp(
system_prompt: Optional[str] = None,
timeout: int = 300,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Handle chat using llama.cpp CLI."""
model_path = resolve_model_path(model)
@@ -118,6 +134,9 @@
raise ValueError(f"Model not found: {model}")

cmd = [LLAMA_CPP_CLI, '-m', model_path, '--n-gpu-layers', '40', '-p', content, '-n', '512', '--single-turn']
if json_schema:
raw_schema = json_schema[SCHEMA_KEY] if SCHEMA_KEY in json_schema else json_schema
cmd += ["--json-schema", json.dumps(raw_schema)]
Rolland-He (Collaborator) commented on Jul 10, 2025:

    Same here.

# Add system prompt if provided
if system_prompt:
@@ -152,20 +171,27 @@ def chat_with_model(
llama_mode: str = "cli",
system_prompt: Optional[str] = None,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Route chat request based on llama_mode: server (external), cli, or ollama fallback; and with optional system prompt."""
if is_llamacpp_available(model):
if llama_mode == "server":
if not LLAMA_SERVER_URL:
raise Exception("LLAMA_SERVER_URL environment variable not set for server mode")
return chat_with_llama_server_http(model, content, system_prompt=system_prompt, image_files=image_files)
return chat_with_llama_server_http(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
)
elif llama_mode == "cli":
return chat_with_llamacpp(model, content, system_prompt=system_prompt, image_files=image_files)
return chat_with_llamacpp(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
)
else:
raise ValueError(f"Invalid llama_mode: '{llama_mode}'. Valid options are 'server' or 'cli'.")
else:
# Model not available in llama.cpp, use ollama
return chat_with_ollama(model, content, system_prompt=system_prompt, image_files=image_files)
return chat_with_ollama(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
)


def authenticate() -> str:
@@ -190,11 +216,14 @@ def chat():
llama_mode = request.form.get('llama_mode', 'cli')
system_prompt = request.form.get('system_prompt')
image_files = list(request.files.values())
json_schema = request.form.get('json_schema')
if json_schema:
json_schema = json.loads(json_schema)

if not content.strip():
abort(400, description='Missing prompt content')

response_content = chat_with_model(model, content, llama_mode, system_prompt, image_files)
response_content = chat_with_model(model, content, llama_mode, system_prompt, image_files, json_schema)
return jsonify(response_content)


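For reference, a minimal client-side sketch of the new json_schema form field. The endpoint path, port, model name, and authentication handling below are assumptions for illustration only; none of them are shown in this diff. The server reads the field from the form data, parses it with json.loads, and the resulting dict is expected to carry a top-level "schema" key (SCHEMA_KEY).

import json

import requests

# Hypothetical schema; the outer "schema" wrapper matches SCHEMA_KEY in server.py.
schema = {"schema": {"type": "object", "properties": {"answer": {"type": "string"}}}}

response = requests.post(
    "http://localhost:5000/chat",  # assumed route and port, not shown in this diff
    data={
        "model": "deepseek-v3",             # placeholder model name
        "content": "Give me an answer",
        "llama_mode": "cli",
        "json_schema": json.dumps(schema),  # sent as a string, parsed server-side with json.loads
    },
    timeout=300,
)
print(response.json())
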
33 changes: 30 additions & 3 deletions test/test_cli_mode.py
@@ -108,7 +108,9 @@ def test_cli_mode_uses_llamacpp_when_available(self):

assert result == "CLI response from DeepSeek V3"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(TEST_LLAMACPP_MODEL, 'Write a function', system_prompt=None, image_files=None)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Write a function', system_prompt=None, image_files=None, json_schema=None
)

def test_cli_mode_fallback_to_ollama_when_unavailable(self):
"""Test CLI mode falls back to ollama when model not available in llama.cpp."""
@@ -119,7 +121,9 @@ def test_cli_mode_fallback_to_ollama_when_unavailable(self):

assert result == "Ollama response from DeepSeek Coder"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(TEST_OLLAMA_MODEL, 'Help with coding', system_prompt=None, image_files=None)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, 'Help with coding', system_prompt=None, image_files=None, json_schema=None
)

def test_default_mode_is_cli(self):
"""Test that default mode is CLI when no llama_mode specified."""
@@ -130,7 +134,9 @@ def test_default_mode_is_cli(self):

assert result == "Default CLI mode response"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(TEST_LLAMACPP_MODEL, 'Help me', system_prompt=None, image_files=None)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Help me', system_prompt=None, image_files=None, json_schema=None
)


class TestCLIModeIntegration:
@@ -167,3 +173,24 @@ def test_complete_cli_fallback_flow_to_ollama(self, mock_glob, mock_ollama):
assert result == "Ollama CLI fallback integration test successful!"
mock_glob.assert_called_once_with(f'/data1/GGUF/{TEST_OLLAMA_MODEL}/*.gguf')
mock_ollama.assert_called_once()

def test_cli_mode_passes_json_schema_to_ollama(self, tmp_path):
"""
When json_schema is supplied, chat_with_model should forward the parsed
schema (as a dict) to chat_with_ollama.
"""
test_schema = {"schema": {"type": "object", "properties": {"answer": {"type": "string"}}}}

# Prepare mocks
with patch('ai_server.server.is_llamacpp_available', return_value=False), patch(
'ai_server.server.chat_with_ollama'
) as mock_ollama:
mock_ollama.return_value = "schema-aware response"

result = chat_with_model(TEST_OLLAMA_MODEL, "Give me an answer", llama_mode='cli', json_schema=test_schema)

assert result == "schema-aware response"

mock_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, "Give me an answer", system_prompt=None, image_files=None, json_schema=test_schema
)
21 changes: 21 additions & 0 deletions test/test_core.py
@@ -83,6 +83,7 @@ def test_chat_with_ollama_success(self, mock_ollama):
model=TEST_OLLAMA_MODEL,
messages=[{'role': 'user', 'content': 'Help me write a Python function'}],
stream=False,
format=None,
)

def test_chat_with_ollama_service_unavailable(self, mock_ollama):
Expand All @@ -98,3 +99,23 @@ def test_chat_with_ollama_model_not_found(self, mock_ollama):

with pytest.raises(Exception, match="model 'nonexistent:latest' not found"):
chat_with_ollama('nonexistent:latest', 'Hello')

def test_chat_with_ollama_with_json_schema(self, mock_ollama, tmp_path):
"""Ollama chat should forward the JSON schema (format=…) when provided."""
# Test schema dict, wrapped under the "schema" key
test_schema = {"schema": {"type": "object", "properties": {"answer": {"type": "string"}}}}

# Mock ollama response
mock_response = MagicMock()
mock_response.message.content = "42"
mock_ollama.return_value = mock_response

result = chat_with_ollama(TEST_OLLAMA_MODEL, "What is the meaning of life?", json_schema=test_schema)

assert result == "42"
mock_ollama.assert_called_once_with(
model=TEST_OLLAMA_MODEL,
messages=[{"role": "user", "content": "What is the meaning of life?"}],
stream=False,
format={"type": "object", "properties": {"answer": {"type": "string"}}},
)
37 changes: 35 additions & 2 deletions test/test_server_mode.py
@@ -111,7 +111,9 @@ def test_server_mode_uses_llamacpp_when_available(self):

assert result == "Server response from DeepSeek V3"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_server.assert_called_once_with(TEST_LLAMACPP_MODEL, 'Explain code', system_prompt=None, image_files=None)
self.mock_chat_server.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Explain code', system_prompt=None, image_files=None, json_schema=None
)

def test_server_mode_fallback_to_ollama_when_unavailable(self):
"""Test server mode falls back to ollama when model not available in llama.cpp."""
Expand All @@ -122,7 +124,9 @@ def test_server_mode_fallback_to_ollama_when_unavailable(self):

assert result == "Ollama fallback response"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(TEST_OLLAMA_MODEL, 'Debug code', system_prompt=None, image_files=None)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, 'Debug code', system_prompt=None, image_files=None, json_schema=None
)

def test_server_mode_requires_server_url(self):
"""Test server mode requires LLAMA_SERVER_URL to be set."""
@@ -178,3 +182,32 @@ def test_complete_server_fallback_flow_to_ollama(self, mock_glob, mock_ollama, m
assert result == "Ollama server fallback integration test successful!"
mock_glob.assert_called_once_with(f'/data1/GGUF/{TEST_OLLAMA_MODEL}/*.gguf')
mock_ollama.assert_called_once()

def test_server_mode_passes_json_schema_to_llama_server(self, tmp_path, mock_requests_post, mock_llama_server_url):
"""
chat_with_model (server mode) should forward the json_schema dict, and
llama-server should receive the unwrapped schema in its JSON request body.
"""
test_schema = {"schema": {"type": "object", "properties": {"answer": {"type": "string"}}}}

with patch('ai_server.server.is_llamacpp_available', return_value=True):
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.json.return_value = {"choices": [{"message": {"content": "Schema-aware server reply"}}]}
mock_requests_post.return_value = mock_response

result = chat_with_model(
TEST_LLAMACPP_MODEL, "Give me an answer", llama_mode="server", json_schema=test_schema
)

assert result == "Schema-aware server reply"

# Verify POST call
mock_requests_post.assert_called_once()
args, kwargs = mock_requests_post.call_args
assert args[0] == "http://localhost:8080/v1/chat/completions"

body = kwargs["json"]
assert body["model"] == TEST_LLAMACPP_MODEL
assert body["messages"][0]["content"] == "Give me an answer"
assert body["json_schema"] == {"type": "object", "properties": {"answer": {"type": "string"}}}
4 changes: 3 additions & 1 deletion test/test_system_prompt.py
@@ -77,4 +77,6 @@ def test_chat_with_model_routing(self, mock_available, mock_chat):
mock_chat.return_value = "result"

chat_with_model(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT)
mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, system_prompt=TEST_SYSTEM_PROMPT, image_files=None)
mock_chat.assert_called_once_with(
TEST_MODEL, TEST_USER_CONTENT, system_prompt=TEST_SYSTEM_PROMPT, image_files=None, json_schema=None
)
4 changes: 2 additions & 2 deletions test/test_system_prompt_api.py
@@ -39,7 +39,7 @@ def test_api_with_system_prompt(self, mock_chat, mock_redis, client):

assert response.status_code == 200

mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT, [])
mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT, [], None)

@patch('ai_server.server.REDIS_CONNECTION')
@patch('ai_server.server.chat_with_model')
@@ -54,7 +54,7 @@ def test_api_without_system_prompt(self, mock_chat, mock_redis, client):

assert response.status_code == 200

mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', None, [])
mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', None, [], None)

@patch('ai_server.server.REDIS_CONNECTION')
def test_api_authentication_still_required(self, mock_redis, client):
Expand Down