Merged
43 changes: 36 additions & 7 deletions ai_server/server.py
@@ -2,6 +2,7 @@

import base64
import glob
import json
import os
import subprocess
from typing import Optional
@@ -32,6 +33,7 @@
if _llama_server_url and not _llama_server_url.startswith(('http://', 'https://'))
else _llama_server_url
)
SCHEMA_KEY = "schema"


def _build_messages(content: str, system_prompt: Optional[str] = None, image_files: Optional[list] = None) -> list:
@@ -52,17 +54,21 @@ def chat_with_llama_server_http(
system_prompt: Optional[str] = None,
timeout: int = 300,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Handle chat using llama-server HTTP API."""
if not LLAMA_SERVER_URL:
raise Exception("LLAMA_SERVER_URL environment variable not set")

try:
messages = _build_messages(content, system_prompt, image_files=[]) # TODO: Pass image files
payload = {'model': model, 'messages': messages, 'stream': False, 'max_tokens': 512}
if json_schema:
payload['json_schema'] = json_schema[SCHEMA_KEY]

response = requests.post(
f'{LLAMA_SERVER_URL}/v1/chat/completions',
json={'model': model, 'messages': messages, 'stream': False, 'max_tokens': 512},
json=payload,
headers={'Content-Type': 'application/json'},
timeout=timeout,
)
@@ -95,12 +101,21 @@ def is_llamacpp_available(model: str) -> bool:


def chat_with_ollama(
model: str, content: str, system_prompt: Optional[str] = None, image_files: Optional[list] = None
model: str,
content: str,
system_prompt: Optional[str] = None,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Handle chat using ollama."""
messages = _build_messages(content, system_prompt, image_files)

response = ollama.chat(model=model, messages=messages, stream=False)
response = ollama.chat(
model=model,
messages=messages,
stream=False,
format=json_schema[SCHEMA_KEY] if json_schema else None,
)
return response.message.content


@@ -110,6 +125,7 @@ def chat_with_llamacpp(
system_prompt: Optional[str] = None,
timeout: int = 300,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Handle chat using llama.cpp CLI."""
model_path = resolve_model_path(model)
@@ -118,6 +134,9 @@
raise ValueError(f"Model not found: {model}")

cmd = [LLAMA_CPP_CLI, '-m', model_path, '--n-gpu-layers', '40', '-p', content, '-n', '512', '--single-turn']
if json_schema:
raw_schema = json_schema[SCHEMA_KEY] if SCHEMA_KEY in json_schema else json_schema
cmd += ["--json-schema", json.dumps(raw_schema)]
Rolland-He (Collaborator) commented on Jul 10, 2025:

    Same here.

# Add system prompt if provided
if system_prompt:
@@ -152,20 +171,27 @@ def chat_with_model(
llama_mode: str = "cli",
system_prompt: Optional[str] = None,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Route chat request based on llama_mode: server (external), cli, or ollama fallback; and with optional system prompt."""
if is_llamacpp_available(model):
if llama_mode == "server":
if not LLAMA_SERVER_URL:
raise Exception("LLAMA_SERVER_URL environment variable not set for server mode")
return chat_with_llama_server_http(model, content, system_prompt=system_prompt, image_files=image_files)
return chat_with_llama_server_http(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
)
elif llama_mode == "cli":
return chat_with_llamacpp(model, content, system_prompt=system_prompt, image_files=image_files)
return chat_with_llamacpp(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
)
else:
raise ValueError(f"Invalid llama_mode: '{llama_mode}'. Valid options are 'server' or 'cli'.")
else:
# Model not available in llama.cpp, use ollama
return chat_with_ollama(model, content, system_prompt=system_prompt, image_files=image_files)
return chat_with_ollama(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
)


def authenticate() -> str:
@@ -190,11 +216,14 @@ def chat():
llama_mode = request.form.get('llama_mode', 'cli')
system_prompt = request.form.get('system_prompt')
image_files = list(request.files.values())
json_schema = request.form.get('json_schema')
if json_schema:
json_schema = json.loads(json_schema)

if not content.strip():
abort(400, description='Missing prompt content')

response_content = chat_with_model(model, content, llama_mode, system_prompt, image_files)
response_content = chat_with_model(model, content, llama_mode, system_prompt, image_files, json_schema)
return jsonify(response_content)


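For reference, a minimal client-side sketch of the new json_schema form field. The endpoint path, port, model name, and authentication handling below are assumptions for illustration only; none of them are shown in this diff. The server reads the field from the form data, parses it with json.loads, and the resulting dict is expected to carry a top-level "schema" key (SCHEMA_KEY).

import json

import requests

# Hypothetical schema; the outer "schema" wrapper matches SCHEMA_KEY in server.py.
schema = {"schema": {"type": "object", "properties": {"answer": {"type": "string"}}}}

response = requests.post(
    "http://localhost:5000/chat",  # assumed route and port, not shown in this diff
    data={
        "model": "deepseek-v3",             # placeholder model name
        "content": "Give me an answer",
        "llama_mode": "cli",
        "json_schema": json.dumps(schema),  # sent as a string, parsed server-side with json.loads
    },
    timeout=300,
)
print(response.json())
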
33 changes: 30 additions & 3 deletions test/test_cli_mode.py
@@ -108,7 +108,9 @@ def test_cli_mode_uses_llamacpp_when_available(self):

assert result == "CLI response from DeepSeek V3"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(TEST_LLAMACPP_MODEL, 'Write a function', system_prompt=None, image_files=None)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Write a function', system_prompt=None, image_files=None, json_schema=None
)

def test_cli_mode_fallback_to_ollama_when_unavailable(self):
"""Test CLI mode falls back to ollama when model not available in llama.cpp."""
@@ -119,7 +121,9 @@ def test_cli_mode_fallback_to_ollama_when_unavailable(self):

assert result == "Ollama response from DeepSeek Coder"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(TEST_OLLAMA_MODEL, 'Help with coding', system_prompt=None, image_files=None)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, 'Help with coding', system_prompt=None, image_files=None, json_schema=None
)

def test_default_mode_is_cli(self):
"""Test that default mode is CLI when no llama_mode specified."""
@@ -130,7 +134,9 @@ def test_default_mode_is_cli(self):

assert result == "Default CLI mode response"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(TEST_LLAMACPP_MODEL, 'Help me', system_prompt=None, image_files=None)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Help me', system_prompt=None, image_files=None, json_schema=None
)


class TestCLIModeIntegration:
@@ -167,3 +173,24 @@ def test_complete_cli_fallback_flow_to_ollama(self, mock_glob, mock_ollama):
assert result == "Ollama CLI fallback integration test successful!"
mock_glob.assert_called_once_with(f'/data1/GGUF/{TEST_OLLAMA_MODEL}/*.gguf')
mock_ollama.assert_called_once()

def test_cli_mode_passes_json_schema_to_ollama(self, tmp_path):
"""
When json_schema is supplied, chat_with_model should forward the parsed
schema (as a dict) to chat_with_ollama.
"""
test_schema = {"schema": {"type": "object", "properties": {"answer": {"type": "string"}}}}

# Prepare mocks
with patch('ai_server.server.is_llamacpp_available', return_value=False), patch(
'ai_server.server.chat_with_ollama'
) as mock_ollama:
mock_ollama.return_value = "schema-aware response"

result = chat_with_model(TEST_OLLAMA_MODEL, "Give me an answer", llama_mode='cli', json_schema=test_schema)

assert result == "schema-aware response"

mock_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, "Give me an answer", system_prompt=None, image_files=None, json_schema=test_schema
)
21 changes: 21 additions & 0 deletions test/test_core.py
@@ -83,6 +83,7 @@ def test_chat_with_ollama_success(self, mock_ollama):
model=TEST_OLLAMA_MODEL,
messages=[{'role': 'user', 'content': 'Help me write a Python function'}],
stream=False,
format=None,
)

def test_chat_with_ollama_service_unavailable(self, mock_ollama):
Expand All @@ -98,3 +99,23 @@ def test_chat_with_ollama_model_not_found(self, mock_ollama):

with pytest.raises(Exception, match="model 'nonexistent:latest' not found"):
chat_with_ollama('nonexistent:latest', 'Hello')

def test_chat_with_ollama_with_json_schema(self, mock_ollama, tmp_path):
"""Ollama chat should forward the JSON schema (format=…) when provided."""
# Test schema dict, wrapped under the "schema" key
test_schema = {"schema": {"type": "object", "properties": {"answer": {"type": "string"}}}}

# Mock ollama response
mock_response = MagicMock()
mock_response.message.content = "42"
mock_ollama.return_value = mock_response

result = chat_with_ollama(TEST_OLLAMA_MODEL, "What is the meaning of life?", json_schema=test_schema)

assert result == "42"
mock_ollama.assert_called_once_with(
model=TEST_OLLAMA_MODEL,
messages=[{"role": "user", "content": "What is the meaning of life?"}],
stream=False,
format={"type": "object", "properties": {"answer": {"type": "string"}}},
)
37 changes: 35 additions & 2 deletions test/test_server_mode.py
@@ -111,7 +111,9 @@ def test_server_mode_uses_llamacpp_when_available(self):

assert result == "Server response from DeepSeek V3"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_server.assert_called_once_with(TEST_LLAMACPP_MODEL, 'Explain code', system_prompt=None, image_files=None)
self.mock_chat_server.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Explain code', system_prompt=None, image_files=None, json_schema=None
)

def test_server_mode_fallback_to_ollama_when_unavailable(self):
"""Test server mode falls back to ollama when model not available in llama.cpp."""
Expand All @@ -122,7 +124,9 @@ def test_server_mode_fallback_to_ollama_when_unavailable(self):

assert result == "Ollama fallback response"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(TEST_OLLAMA_MODEL, 'Debug code', system_prompt=None, image_files=None)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, 'Debug code', system_prompt=None, image_files=None, json_schema=None
)

def test_server_mode_requires_server_url(self):
"""Test server mode requires LLAMA_SERVER_URL to be set."""
@@ -178,3 +182,32 @@ def test_complete_server_fallback_flow_to_ollama(self, mock_glob, mock_ollama, m
assert result == "Ollama server fallback integration test successful!"
mock_glob.assert_called_once_with(f'/data1/GGUF/{TEST_OLLAMA_MODEL}/*.gguf')
mock_ollama.assert_called_once()

def test_server_mode_passes_json_schema_to_llama_server(self, tmp_path, mock_requests_post, mock_llama_server_url):
"""
chat_with_model (server mode) should forward the json_schema dict, and
llama-server should receive the unwrapped schema in its JSON request body.
"""
test_schema = {"schema": {"type": "object", "properties": {"answer": {"type": "string"}}}}

with patch('ai_server.server.is_llamacpp_available', return_value=True):
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.json.return_value = {"choices": [{"message": {"content": "Schema-aware server reply"}}]}
mock_requests_post.return_value = mock_response

result = chat_with_model(
TEST_LLAMACPP_MODEL, "Give me an answer", llama_mode="server", json_schema=test_schema
)

assert result == "Schema-aware server reply"

# Verify POST call
mock_requests_post.assert_called_once()
args, kwargs = mock_requests_post.call_args
assert args[0] == "http://localhost:8080/v1/chat/completions"

body = kwargs["json"]
assert body["model"] == TEST_LLAMACPP_MODEL
assert body["messages"][0]["content"] == "Give me an answer"
assert body["json_schema"] == {"type": "object", "properties": {"answer": {"type": "string"}}}
4 changes: 3 additions & 1 deletion test/test_system_prompt.py
@@ -77,4 +77,6 @@ def test_chat_with_model_routing(self, mock_available, mock_chat):
mock_chat.return_value = "result"

chat_with_model(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT)
mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, system_prompt=TEST_SYSTEM_PROMPT, image_files=None)
mock_chat.assert_called_once_with(
TEST_MODEL, TEST_USER_CONTENT, system_prompt=TEST_SYSTEM_PROMPT, image_files=None, json_schema=None
)
4 changes: 2 additions & 2 deletions test/test_system_prompt_api.py
@@ -39,7 +39,7 @@ def test_api_with_system_prompt(self, mock_chat, mock_redis, client):

assert response.status_code == 200

mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT, [])
mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT, [], None)

@patch('ai_server.server.REDIS_CONNECTION')
@patch('ai_server.server.chat_with_model')
@@ -54,7 +54,7 @@ def test_api_without_system_prompt(self, mock_chat, mock_redis, client):

assert response.status_code == 200

mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', None, [])
mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', None, [], None)

@patch('ai_server.server.REDIS_CONNECTION')
def test_api_authentication_still_required(self, mock_redis, client):
Expand Down