Merged
43 changes: 38 additions & 5 deletions ai_server/server.py
@@ -55,16 +55,23 @@ def chat_with_llama_server_http(
timeout: int = 300,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
model_options: Optional[dict] = None,
) -> str:
"""Handle chat using llama-server HTTP API."""
if not LLAMA_SERVER_URL:
raise Exception("LLAMA_SERVER_URL environment variable not set")

try:
messages = _build_messages(content, system_prompt, image_files=[]) # TODO: Pass image files
payload = {'model': model, 'messages': messages, 'stream': False, 'max_tokens': 512}

if not model_options:
model_options = {}

payload = {'model': model, 'messages': messages, **model_options}
Contributor:

Please keep the stream option hard coded. We can allow the client to override the default value, but we should set a False default.

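A minimal sketch of the requested behaviour, reusing the names from this diff (dict.setdefault is simply an equivalent spelling of the 'stream' check added below):

payload = {'model': model, 'messages': messages, **(model_options or {})}
payload.setdefault('stream', False)  # hard-coded default; a client may still override it via model_options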
if json_schema:
payload['json_schema'] = json_schema[SCHEMA_KEY]
if 'stream' not in payload:
payload['stream'] = False

response = requests.post(
f'{LLAMA_SERVER_URL}/v1/chat/completions',
@@ -106,6 +113,7 @@ def chat_with_ollama(
system_prompt: Optional[str] = None,
image_files: Optional[list] = None,
json_schema: Optional[dict] = None,
model_options: Optional[dict] = None,
) -> str:
"""Handle chat using ollama."""
messages = _build_messages(content, system_prompt, image_files)
@@ -115,6 +123,7 @@
messages=messages,
stream=False,
format=json_schema[SCHEMA_KEY] if json_schema else None,
options=model_options,
)
return response.message.content

@@ -125,6 +134,7 @@ def chat_with_llamacpp(
system_prompt: Optional[str] = None,
timeout: int = 300,
image_files: Optional[list] = None,
model_options: Optional[dict] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Handle chat using llama.cpp CLI."""
@@ -142,6 +152,10 @@
if system_prompt:
cmd.extend(['--system-prompt', system_prompt])

if model_options:
for key, value in model_options.items():
cmd.extend(['--model-option', key, str(value)])  # subprocess arguments must be strings, so cast option values

if image_files:
pass # TODO: pass image files

@@ -171,6 +185,7 @@ def chat_with_model(
llama_mode: str = "cli",
system_prompt: Optional[str] = None,
image_files: Optional[list] = None,
model_options: Optional[dict] = None,
json_schema: Optional[dict] = None,
) -> str:
"""Route chat request based on llama_mode: server (external), cli, or ollama fallback; and with optional system prompt."""
@@ -179,18 +194,33 @@
if not LLAMA_SERVER_URL:
raise Exception("LLAMA_SERVER_URL environment variable not set for server mode")
return chat_with_llama_server_http(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
model,
content,
system_prompt=system_prompt,
image_files=image_files,
json_schema=json_schema,
model_options=model_options,
)
elif llama_mode == "cli":
return chat_with_llamacpp(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
model,
content,
system_prompt=system_prompt,
image_files=image_files,
json_schema=json_schema,
model_options=model_options,
)
else:
raise ValueError(f"Invalid llama_mode: '{llama_mode}'. Valid options are 'server' or 'cli'.")
else:
# Model not available in llama.cpp, use ollama
return chat_with_ollama(
model, content, system_prompt=system_prompt, image_files=image_files, json_schema=json_schema
model,
content,
system_prompt=system_prompt,
image_files=image_files,
json_schema=json_schema,
model_options=model_options,
)


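For reference, a caller inside the codebase could now thread sampling options through the router roughly like this (a sketch only; the model name is hypothetical, the import path follows the tests' use of ai_server.server):

from ai_server.server import chat_with_model

reply = chat_with_model(
    'deepseek-v3',  # hypothetical model name
    'Explain this function',
    llama_mode='server',
    model_options={'temperature': 0.2, 'top_p': 0.9},
)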
@@ -216,14 +246,17 @@ def chat():
llama_mode = request.form.get('llama_mode', 'cli')
system_prompt = request.form.get('system_prompt')
image_files = list(request.files.values())
model_options = request.form.get('model_options')
if model_options:
model_options = json.loads(model_options)  # parse the JSON-encoded form field, mirroring json_schema below
json_schema = request.form.get('json_schema')
if json_schema:
json_schema = json.loads(json_schema)

if not content.strip():
abort(400, description='Missing prompt content')

response_content = chat_with_model(model, content, llama_mode, system_prompt, image_files, json_schema)
response_content = chat_with_model(
model, content, llama_mode, system_prompt, image_files, model_options=model_options, json_schema=json_schema
)
return jsonify(response_content)


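An external client would send model_options as a JSON-encoded form field. The sketch below is hypothetical: the route path, port, 'model' and 'content' field names, and the absence of authentication are assumptions, not shown in this diff.

import json
import requests

response = requests.post(
    'http://localhost:5000/chat',  # assumed route and port
    data={
        'model': 'deepseek-v3',                # hypothetical model name
        'content': 'Summarize this log file',  # assumed prompt field name
        'llama_mode': 'server',
        'model_options': json.dumps({'temperature': 0.2, 'top_p': 0.9}),
    },
    timeout=300,
)
print(response.json())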
46 changes: 42 additions & 4 deletions test/test_cli_mode.py
@@ -109,7 +109,12 @@ def test_cli_mode_uses_llamacpp_when_available(self):
assert result == "CLI response from DeepSeek V3"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Write a function', system_prompt=None, image_files=None, json_schema=None
TEST_LLAMACPP_MODEL,
'Write a function',
system_prompt=None,
image_files=None,
json_schema=None,
model_options=None,
)

def test_cli_mode_fallback_to_ollama_when_unavailable(self):
@@ -122,7 +127,12 @@ def test_cli_mode_fallback_to_ollama_when_unavailable(self):
assert result == "Ollama response from DeepSeek Coder"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, 'Help with coding', system_prompt=None, image_files=None, json_schema=None
TEST_OLLAMA_MODEL,
'Help with coding',
system_prompt=None,
image_files=None,
json_schema=None,
model_options=None,
)

def test_default_mode_is_cli(self):
@@ -135,7 +145,30 @@ def test_default_mode_is_cli(self):
assert result == "Default CLI mode response"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Help me', system_prompt=None, image_files=None, json_schema=None
TEST_LLAMACPP_MODEL, 'Help me', system_prompt=None, image_files=None, json_schema=None, model_options=None
)

def test_model_options(self):
"""Test that model_options are passed correctly in CLI mode."""
self.mock_available.return_value = True
self.mock_chat_llamacpp.return_value = "Model options test response"

test_options = {"temperature": 0.7, "top_p": 0.9}

result = chat_with_model(TEST_LLAMACPP_MODEL, 'Help me', model_options=test_options)

assert result == "Model options test response"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_llamacpp.assert_called_once_with(
TEST_LLAMACPP_MODEL,
'Help me',
system_prompt=None,
image_files=None,
json_schema=None,
model_options=test_options,
)


@@ -192,5 +225,10 @@ def test_cli_mode_passes_json_schema_to_ollama(self, tmp_path):
assert result == "schema-aware response"

mock_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, "Give me an answer", system_prompt=None, image_files=None, json_schema=test_schema
TEST_OLLAMA_MODEL,
"Give me an answer",
system_prompt=None,
image_files=None,
json_schema=test_schema,
model_options=None,
)
8 changes: 6 additions & 2 deletions test/test_core.py
@@ -83,6 +83,7 @@ def test_chat_with_ollama_success(self, mock_ollama):
model=TEST_OLLAMA_MODEL,
messages=[{'role': 'user', 'content': 'Help me write a Python function'}],
stream=False,
options=None,
format=None,
)

@@ -110,12 +111,15 @@ def test_chat_with_ollama_with_json_schema(self, mock_ollama, tmp_path):
mock_response.message.content = "42"
mock_ollama.return_value = mock_response

result = chat_with_ollama(TEST_OLLAMA_MODEL, "What is the meaning of life?", json_schema=test_schema)
result = chat_with_ollama(
TEST_OLLAMA_MODEL, "What is the meaning of life?", json_schema=test_schema, model_options=None
)

assert result == "42"
mock_ollama.assert_called_once_with(
model=TEST_OLLAMA_MODEL,
messages=[{"role": "user", "content": "What is the meaning of life?"}],
stream=False,
format={"type": "object", "properties": {"answer": {"type": "string"}}},
format={'type': 'object', 'properties': {'answer': {'type': 'string'}}},
options=None,
)
32 changes: 30 additions & 2 deletions test/test_server_mode.py
@@ -112,7 +112,12 @@ def test_server_mode_uses_llamacpp_when_available(self):
assert result == "Server response from DeepSeek V3"
self.mock_available.assert_called_once_with(TEST_LLAMACPP_MODEL)
self.mock_chat_server.assert_called_once_with(
TEST_LLAMACPP_MODEL, 'Explain code', system_prompt=None, image_files=None, json_schema=None
TEST_LLAMACPP_MODEL,
'Explain code',
system_prompt=None,
image_files=None,
json_schema=None,
model_options=None,
)

def test_server_mode_fallback_to_ollama_when_unavailable(self):
@@ -125,7 +130,30 @@ def test_server_mode_fallback_to_ollama_when_unavailable(self):
assert result == "Ollama fallback response"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL, 'Debug code', system_prompt=None, image_files=None, json_schema=None
TEST_OLLAMA_MODEL, 'Debug code', system_prompt=None, image_files=None, json_schema=None, model_options=None
)

def test_server_mode_fallback_to_ollama_with_model_options(self):
"""Test server mode falls back to ollama and passes model_options correctly when model is unavailable in llama.cpp."""
self.mock_available.return_value = False
self.mock_chat_ollama.return_value = "Ollama fallback response"

test_options = {"temperature": 0.3, "top_k": 50}

result = chat_with_model(TEST_OLLAMA_MODEL, 'Debug code', llama_mode='server', model_options=test_options)

assert result == "Ollama fallback response"
self.mock_available.assert_called_once_with(TEST_OLLAMA_MODEL)
self.mock_chat_ollama.assert_called_once_with(
TEST_OLLAMA_MODEL,
'Debug code',
system_prompt=None,
image_files=None,
model_options=test_options,
json_schema=None,
)

def test_server_mode_requires_server_url(self):
7 changes: 6 additions & 1 deletion test/test_system_prompt.py
@@ -78,5 +78,10 @@ def test_chat_with_model_routing(self, mock_available, mock_chat):

chat_with_model(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT)
mock_chat.assert_called_once_with(
TEST_MODEL, TEST_USER_CONTENT, system_prompt=TEST_SYSTEM_PROMPT, image_files=None, json_schema=None
TEST_MODEL,
TEST_USER_CONTENT,
system_prompt=TEST_SYSTEM_PROMPT,
image_files=None,
json_schema=None,
model_options=None,
)
9 changes: 7 additions & 2 deletions test/test_system_prompt_api.py
@@ -1,3 +1,4 @@
import json
from unittest.mock import patch

import pytest
@@ -39,7 +40,9 @@ def test_api_with_system_prompt(self, mock_chat, mock_redis, client):

assert response.status_code == 200

mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT, [], None)
mock_chat.assert_called_once_with(
TEST_MODEL, TEST_USER_CONTENT, 'cli', TEST_SYSTEM_PROMPT, [], json_schema=None, model_options=None
)

@patch('ai_server.server.REDIS_CONNECTION')
@patch('ai_server.server.chat_with_model')
@@ -54,7 +57,9 @@ def test_api_without_system_prompt(self, mock_chat, mock_redis, client):

assert response.status_code == 200

mock_chat.assert_called_once_with(TEST_MODEL, TEST_USER_CONTENT, 'cli', None, [], None)
mock_chat.assert_called_once_with(
TEST_MODEL, TEST_USER_CONTENT, 'cli', None, [], model_options=None, json_schema=None
)

@patch('ai_server.server.REDIS_CONNECTION')
def test_api_authentication_still_required(self, mock_redis, client):