This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Enable the integration tests for llamacpp #868

Merged · 3 commits · Feb 1, 2025
11 changes: 11 additions & 0 deletions .github/workflows/integration-tests.yml
@@ -51,6 +51,11 @@ jobs:
echo "Loaded image:"
docker images

- name: Download the Qwen2.5-Coder-0.5B-Instruct-GGUF model
run: |
# This is needed for the llamacpp integration tests
wget -P ./codegate_volume/models https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-0.5b-instruct-q5_k_m.gguf

- name: Run container from the loaded image
run: |
# Get the image name
@@ -235,6 +240,12 @@ jobs:
run: |
poetry run python tests/integration/integration_tests.py

- name: Run integration tests - llamacpp
env:
CODEGATE_PROVIDERS: "llamacpp"
run: |
poetry run python tests/integration/integration_tests.py

- name: Print the CodeGate container logs (useful for debugging)
if: always()
run: |
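The new step fetches the quantized Qwen2.5-Coder-0.5B GGUF into ./codegate_volume/models, which is presumably mounted into the container as the provider's model base path, so the llamacpp provider can resolve "<model>.gguf" at request time. A small sketch (not part of the PR; the path and model name are taken from the workflow step and the test cases below) to sanity-check the download before the tests run:

# Illustrative check, not part of the PR: confirm the GGUF downloaded by the step above
# exists where the llamacpp provider will look for it ("<model_base_path>/<model>.gguf").
from pathlib import Path

models_dir = Path("./codegate_volume/models")        # wget -P target in the workflow step
model_name = "qwen2.5-coder-0.5b-instruct-q5_k_m"    # model name used in testcases.yaml

model_path = models_dir / f"{model_name}.gguf"
if not model_path.is_file():
    raise SystemExit(f"Missing integration-test model: {model_path}")
print(f"Found {model_path} ({model_path.stat().st_size / 2**20:.0f} MiB)")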
10 changes: 8 additions & 2 deletions src/codegate/providers/llamacpp/completion_handler.py
@@ -59,19 +59,25 @@ async def execute_completion(
"""
model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"

# Create a copy of the request dict and remove stream_options
# Reason - Request error as JSON:
# {'error': "Llama.create_completion() got an unexpected keyword argument 'stream_options'"}
request_dict = dict(request)
request_dict.pop("stream_options", None)

if is_fim_request:
response = await self.inference_engine.complete(
model_path,
Config.get_config().chat_model_n_ctx,
Config.get_config().chat_model_n_gpu_layers,
**request,
**request_dict,
)
else:
response = await self.inference_engine.chat(
model_path,
Config.get_config().chat_model_n_ctx,
Config.get_config().chat_model_n_gpu_layers,
**request,
**request_dict,
)

return convert_to_async_iterator(response) if stream else response
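The handler now copies the request dict and drops stream_options before splatting it into the inference engine, since llama-cpp-python's Llama.create_completion() rejects that OpenAI-style keyword. A self-contained sketch of the failure mode and the fix (create_completion below is a stub that only mimics the keyword restriction, not the real binding):

# Stub mimicking llama-cpp-python's restriction: there is no 'stream_options' parameter.
def create_completion(prompt, max_tokens=16, temperature=0.0, stream=False):
    return {"choices": [{"text": "..."}]}

request = {
    "prompt": "def hello():",
    "stream": True,
    "stream_options": {"include_usage": True},  # OpenAI-style field the backend does not accept
}

try:
    create_completion(**request)
except TypeError as exc:
    # "create_completion() got an unexpected keyword argument 'stream_options'"
    print(exc)

request_dict = dict(request)               # copy so the original request stays untouched
request_dict.pop("stream_options", None)   # the fix applied in execute_completion()
print(create_completion(**request_dict))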
26 changes: 17 additions & 9 deletions tests/integration/integration_tests.py
@@ -67,17 +67,25 @@ def parse_response_message(response, streaming=True):
if "DONE" in decoded_line or "message_stop" in decoded_line:
break

decoded_line = decoded_line.replace("data:", "")
decoded_line = decoded_line.replace("data:", "").strip()
json_line = json.loads(decoded_line)

message_content = None
# Handle both chat and FIM responses
if "choices" in json_line:
if "finish_reason" in json_line["choices"][0]:
choice = json_line["choices"][0]
# Break if the conversation is over
if choice.get("finish_reason") == "stop":
break
if "delta" in json_line["choices"][0]:
message_content = json_line["choices"][0]["delta"].get("content", "")
elif "text" in json_line["choices"][0]:
message_content = json_line["choices"][0].get("text", "")
# Handle chat responses
if "delta" in choice:
delta = choice["delta"]
if "content" in delta and delta["content"] is not None:
message_content = delta["content"]
# Handle FIM responses
elif "text" in choice:
text = choice["text"]
if text is not None:
message_content = text
elif "delta" in json_line:
message_content = json_line["delta"].get("text", "")
elif "message" in json_line:
@@ -87,7 +95,6 @@ def parse_response_message(response, streaming=True):

if message_content is not None:
response_message += message_content

else:
if "choices" in response.json():
response_message = response.json()["choices"][0]["message"].get("content", "")
@@ -97,7 +104,8 @@ def parse_response_message(response, streaming=True):
except Exception as e:
logger.exception("An error occurred: %s", e)

return response_message
# Remove any trailing newlines and return
return response_message.strip()

@staticmethod
def replace_env_variables(input_string, env):
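The rewritten streaming loop distinguishes chat chunks (content under choices[0].delta.content) from FIM/completion chunks (content under choices[0].text) and stops once finish_reason is "stop". A standalone sketch of those three cases (the payloads are illustrative examples, not captured from a real CodeGate run):

import json

# Illustrative SSE payloads covering the three shapes the updated parser handles.
chat_chunk = 'data: {"choices": [{"delta": {"content": "Hello"}, "finish_reason": null}]}'
fim_chunk = 'data: {"choices": [{"text": "print(\\"Hello, World!\\")", "finish_reason": null}]}'
stop_chunk = 'data: {"choices": [{"delta": {}, "finish_reason": "stop"}]}'

for line in (chat_chunk, fim_chunk, stop_chunk):
    choice = json.loads(line.replace("data:", "").strip())["choices"][0]
    if choice.get("finish_reason") == "stop":
        print("stream finished")
        break
    if "delta" in choice:      # chat completion chunk
        print("chat content:", choice["delta"].get("content"))
    elif "text" in choice:     # FIM / plain completion chunk
        print("fim content:", choice["text"])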
14 changes: 6 additions & 8 deletions tests/integration/testcases.yaml
@@ -6,6 +6,7 @@ headers:
ollama:
Content-Type: application/json
llamacpp:
Content-Type: application/json
anthropic:
x-api-key: ENV_ANTHROPIC_KEY
copilot:
@@ -68,7 +69,7 @@ testcases:
"role":"user"
}
],
"model":"qwen2.5-coder-1.5b-instruct-q5_k_m",
"model":"qwen2.5-coder-0.5b-instruct-q5_k_m",
"stream":true,
"temperature":0
}
@@ -81,18 +82,15 @@
url: http://127.0.0.1:8989/llamacpp/completions
data: |
{
"model": "qwen2.5-coder-1.5b-instruct-q5_k_m",
"model": "qwen2.5-coder-0.5b-instruct-q5_k_m",
"max_tokens": 4096,
"temperature": 0,
"stream": true,
"stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>", "<|im_start|>", "<|im_end|>", "/src/", "#- coding: utf-8", "```"],
"prompt":"<|fim_prefix|>\n# codegate/test.py\nimport invokehttp\nimport requests\n\nkey = \"mysecret-key\"\n\ndef call_api():\n <|fim_suffix|>\n\n\ndata = {'key1': 'test1', 'key2': 'test2'}\nresponse = call_api('http://localhost:8080', method='post', data='data')\n<|fim_middle|>"
"stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>", "<|im_start|>", "<|im_end|>", "/src/", "#- coding: utf-8", "```", "def test"],
"prompt":"# Do not add comments\n<|fim_prefix|>\n# codegate/greet.py\ndef print_hello():\n <|fim_suffix|>\n\n\nprint_hello()\n<|fim_middle|>"
}
likes: |
url = 'http://localhost:8080'
headers = {'Authorization': f'Bearer {key}'}
response = requests.get(url, headers=headers)
return response.json()
print("Hello, World!")

openai_chat:
name: OpenAI Chat
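For reference, the llamacpp FIM test case above boils down to a plain HTTP POST against CodeGate's llamacpp completions endpoint. A minimal sketch of an equivalent request (the URL, headers, and payload fields are taken from the test case; the surrounding script is illustrative and not part of the PR):

# Stand-alone request mirroring the llamacpp FIM test case above.
import requests

payload = {
    "model": "qwen2.5-coder-0.5b-instruct-q5_k_m",
    "max_tokens": 4096,
    "temperature": 0,
    "stream": True,
    # Abbreviated stop list; the test case uses the full set shown above.
    "stop": ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|im_end|>"],
    "prompt": "# Do not add comments\n<|fim_prefix|>\n# codegate/greet.py\ndef print_hello():\n    <|fim_suffix|>\n\n\nprint_hello()\n<|fim_middle|>",
}

# CodeGate is expected to be listening on 127.0.0.1:8989, as in the test case URL.
response = requests.post(
    "http://127.0.0.1:8989/llamacpp/completions",
    json=payload,
    headers={"Content-Type": "application/json"},
    stream=True,
)
for line in response.iter_lines():
    if line:
        print(line.decode())  # raw SSE chunks; parse_response_message() assembles them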