This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Enable the integration tests for llamacpp #868

Merged · 3 commits · Feb 1, 2025
11 changes: 11 additions & 0 deletions .github/workflows/integration-tests.yml
@@ -51,6 +51,11 @@ jobs:
echo "Loaded image:"
docker images

- name: Download the Qwen2.5-Coder-0.5B-Instruct-GGUF model
run: |
# This is needed for the llamacpp integration tests
wget -P ./codegate_volume/models https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-0.5b-instruct-q5_k_m.gguf

- name: Run container from the loaded image
run: |
# Get the image name
@@ -235,6 +240,12 @@ jobs:
run: |
poetry run python tests/integration/integration_tests.py

- name: Run integration tests - llamacpp
env:
CODEGATE_PROVIDERS: "llamacpp"
run: |
poetry run python tests/integration/integration_tests.py

- name: Print the CodeGate container logs (useful for debugging)
if: always()
run: |
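The new step fetches the quantized Qwen2.5-Coder-0.5B GGUF into ./codegate_volume/models, which is presumably mounted into the container as the provider's model base path, so the llamacpp provider can resolve "<model>.gguf" at request time. A small sketch (not part of the PR; the path and model name are taken from the workflow step and the test cases below) to sanity-check the download before the tests run:

# Illustrative check, not part of the PR: confirm the GGUF downloaded by the step above
# exists where the llamacpp provider will look for it ("<model_base_path>/<model>.gguf").
from pathlib import Path

models_dir = Path("./codegate_volume/models")        # wget -P target in the workflow step
model_name = "qwen2.5-coder-0.5b-instruct-q5_k_m"    # model name used in testcases.yaml

model_path = models_dir / f"{model_name}.gguf"
if not model_path.is_file():
    raise SystemExit(f"Missing integration-test model: {model_path}")
print(f"Found {model_path} ({model_path.stat().st_size / 2**20:.0f} MiB)")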
10 changes: 8 additions & 2 deletions src/codegate/providers/llamacpp/completion_handler.py
@@ -59,19 +59,25 @@ async def execute_completion(
"""
model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"

# Create a copy of the request dict and remove stream_options
# Reason - Request error as JSON:
# {'error': "Llama.create_completion() got an unexpected keyword argument 'stream_options'"}
request_dict = dict(request)
request_dict.pop("stream_options", None)

if is_fim_request:
response = await self.inference_engine.complete(
model_path,
Config.get_config().chat_model_n_ctx,
Config.get_config().chat_model_n_gpu_layers,
**request,
**request_dict,
)
else:
response = await self.inference_engine.chat(
model_path,
Config.get_config().chat_model_n_ctx,
Config.get_config().chat_model_n_gpu_layers,
**request,
**request_dict,
)

return convert_to_async_iterator(response) if stream else response
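The handler now copies the request dict and drops stream_options before splatting it into the inference engine, since llama-cpp-python's Llama.create_completion() rejects that OpenAI-style keyword. A self-contained sketch of the failure mode and the fix (create_completion below is a stub that only mimics the keyword restriction, not the real binding):

# Stub mimicking llama-cpp-python's restriction: there is no 'stream_options' parameter.
def create_completion(prompt, max_tokens=16, temperature=0.0, stream=False):
    return {"choices": [{"text": "..."}]}

request = {
    "prompt": "def hello():",
    "stream": True,
    "stream_options": {"include_usage": True},  # OpenAI-style field the backend does not accept
}

try:
    create_completion(**request)
except TypeError as exc:
    # "create_completion() got an unexpected keyword argument 'stream_options'"
    print(exc)

request_dict = dict(request)               # copy so the original request stays untouched
request_dict.pop("stream_options", None)   # the fix applied in execute_completion()
print(create_completion(**request_dict))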
26 changes: 17 additions & 9 deletions tests/integration/integration_tests.py
@@ -67,17 +67,25 @@ def parse_response_message(response, streaming=True):
if "DONE" in decoded_line or "message_stop" in decoded_line:
break

decoded_line = decoded_line.replace("data:", "")
decoded_line = decoded_line.replace("data:", "").strip()
json_line = json.loads(decoded_line)

message_content = None
# Handle both chat and FIM responses
if "choices" in json_line:
if "finish_reason" in json_line["choices"][0]:
choice = json_line["choices"][0]
# Break if the conversation is over
if choice.get("finish_reason") == "stop":
break
if "delta" in json_line["choices"][0]:
message_content = json_line["choices"][0]["delta"].get("content", "")
elif "text" in json_line["choices"][0]:
message_content = json_line["choices"][0].get("text", "")
# Handle chat responses
if "delta" in choice:
delta = choice["delta"]
if "content" in delta and delta["content"] is not None:
message_content = delta["content"]
# Handle FIM responses
elif "text" in choice:
text = choice["text"]
if text is not None:
message_content = text
elif "delta" in json_line:
message_content = json_line["delta"].get("text", "")
elif "message" in json_line:
@@ -87,7 +95,6 @@ def parse_response_message(response, streaming=True):

if message_content is not None:
response_message += message_content

else:
if "choices" in response.json():
response_message = response.json()["choices"][0]["message"].get("content", "")
@@ -97,7 +104,8 @@ def parse_response_message(response, streaming=True):
except Exception as e:
logger.exception("An error occurred: %s", e)

return response_message
# Remove any trailing newlines and return
return response_message.strip()

@staticmethod
def replace_env_variables(input_string, env):
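The rewritten streaming loop distinguishes chat chunks (content under choices[0].delta.content) from FIM/completion chunks (content under choices[0].text) and stops once finish_reason is "stop". A standalone sketch of those three cases (the payloads are illustrative examples, not captured from a real CodeGate run):

import json

# Illustrative SSE payloads covering the three shapes the updated parser handles.
chat_chunk = 'data: {"choices": [{"delta": {"content": "Hello"}, "finish_reason": null}]}'
fim_chunk = 'data: {"choices": [{"text": "print(\\"Hello, World!\\")", "finish_reason": null}]}'
stop_chunk = 'data: {"choices": [{"delta": {}, "finish_reason": "stop"}]}'

for line in (chat_chunk, fim_chunk, stop_chunk):
    choice = json.loads(line.replace("data:", "").strip())["choices"][0]
    if choice.get("finish_reason") == "stop":
        print("stream finished")
        break
    if "delta" in choice:      # chat completion chunk
        print("chat content:", choice["delta"].get("content"))
    elif "text" in choice:     # FIM / plain completion chunk
        print("fim content:", choice["text"])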
14 changes: 6 additions & 8 deletions tests/integration/testcases.yaml
@@ -6,6 +6,7 @@ headers:
ollama:
Content-Type: application/json
llamacpp:
Content-Type: application/json
anthropic:
x-api-key: ENV_ANTHROPIC_KEY
copilot:
@@ -68,7 +69,7 @@ testcases:
"role":"user"
}
],
"model":"qwen2.5-coder-1.5b-instruct-q5_k_m",
"model":"qwen2.5-coder-0.5b-instruct-q5_k_m",
"stream":true,
"temperature":0
}
@@ -81,18 +82,15 @@
url: http://127.0.0.1:8989/llamacpp/completions
data: |
{
"model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
"model": "qwen2.5-coder-0.5b-instruct-q5_k_m",
"max_tokens": 4096,
"temperature": 0,
"stream": true,
"stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>", "<|im_start|>", "<|im_end|>", "/src/", "#- coding: utf-8", "```"],
"prompt":"<|fim_prefix|>\n# codegate/test.py\nimport invokehttp\nimport requests\n\nkey = \"mysecret-key\"\n\ndef call_api():\n <|fim_suffix|>\n\n\ndata = {'key1': 'test1', 'key2': 'test2'}\nresponse = call_api('http://localhost:8080', method='post', data='data')\n<|fim_middle|>"
"stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>", "<|im_start|>", "<|im_end|>", "/src/", "#- coding: utf-8", "```", "def test"],
"prompt":"# Do not add comments\n<|fim_prefix|>\n# codegate/greet.py\ndef print_hello():\n <|fim_suffix|>\n\n\nprint_hello()\n<|fim_middle|>"
}
likes: |
url = 'http://localhost:8080'
headers = {'Authorization': f'Bearer {key}'}
response = requests.get(url, headers=headers)
return response.json()
print("Hello, World!")

openai_chat:
name: OpenAI Chat
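For reference, the llamacpp FIM test case above boils down to a plain HTTP POST against CodeGate's llamacpp completions endpoint. A minimal sketch of an equivalent request (the URL, headers, and payload fields are taken from the test case; the surrounding script is illustrative and not part of the PR):

# Stand-alone request mirroring the llamacpp FIM test case above.
import requests

payload = {
    "model": "qwen2.5-coder-0.5b-instruct-q5_k_m",
    "max_tokens": 4096,
    "temperature": 0,
    "stream": True,
    # Abbreviated stop list; the test case uses the full set shown above.
    "stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|im_end|>"],
    "prompt": "# Do not add comments\n<|fim_prefix|>\n# codegate/greet.py\ndef print_hello():\n    <|fim_suffix|>\n\n\nprint_hello()\n<|fim_middle|>",
}

# CodeGate is expected to be listening on 127.0.0.1:8989, as in the test case URL.
response = requests.post(
    "http://127.0.0.1:8989/llamacpp/completions",
    json=payload,
    headers={"Content-Type": "application/json"},
    stream=True,
)
for line in response.iter_lines():
    if line:
        print(line.decode())  # raw SSE chunks; parse_response_message() assembles them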