Skip to content

Commit e87e5f9

Browse files
committed
fix vector search package extraction
1 parent cd526a1 commit e87e5f9

File tree

4 files changed

+113
-5
lines changed

4 files changed

+113
-5
lines changed

src/codegate/pipeline/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,17 @@ def get_last_user_message_block(
260260
# Iterate in reverse to find the last block of consecutive 'user' messages
261261
for i in reversed(range(len(messages))):
262262
if messages[i]["role"] == "user" or messages[i]["role"] == "assistant":
263+
content_str = None
264+
if "content" in messages[i]:
265+
content_str = messages[i]["content"] # type: ignore
266+
else:
267+
continue
268+
263269
if messages[i]["role"] == "user":
264-
user_messages.append(messages[i]["content"]) # type: ignore
270+
user_messages.append(content_str)
271+
# specifically for Aider: stop when an assistant "Ok." message is found
272+
if content_str == "Ok." and messages[i]["role"] == "assistant":
273+
break
265274
else:
266275
# Stop when a message with a different role is encountered
267276
if user_messages:

src/codegate/pipeline/codegate_context_retriever/codegate.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ async def process(
8080
)
8181

8282
logger.info(f"Found {len(snippet_packages)} packages "
83-
"for language {snippet_language} in code snippets.")
83+
f"for language {snippet_language} in code snippets.")
8484
# Find bad packages in the snippets
8585
bad_snippet_packages = await storage_engine.search(
8686
language=snippet_language, packages=snippet_packages) # type: ignore
@@ -89,12 +89,19 @@ async def process(
8989
# Remove code snippets from the user messages and search for bad packages
9090
# in the rest of the user query/messages
9191
user_messages = re.sub(r"```.*?```", "", user_message, flags=re.DOTALL)
92+
user_messages = re.sub(r"⋮...*?⋮...\n\n", "", user_messages, flags=re.DOTALL) # regex used in aider
9293

93-
# Vector search to find bad packages
94-
bad_packages = await storage_engine.search(query=user_messages, distance=0.5, limit=100)
94+
# split messages on double newlines, to avoid passing too much content to the search
95+
split_messages = user_messages.split("\n\n")
96+
collected_bad_packages = []
97+
for item_message in split_messages:
98+
# Vector search to find bad packages
99+
bad_packages = await storage_engine.search(query=item_message, distance=0.5, limit=100)
100+
if bad_packages and len(bad_packages) > 0:
101+
collected_bad_packages.extend(bad_packages)
95102

96103
# All bad packages
97-
all_bad_packages = bad_snippet_packages + bad_packages
104+
all_bad_packages = bad_snippet_packages + collected_bad_packages
98105

99106
logger.info(f"Adding {len(all_bad_packages)} bad packages to the context.")
100107

src/codegate/pipeline/extract_snippets/extract_snippets.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ def ecosystem_from_message(message: str) -> Optional[str]:
6666
"ts": "typescript",
6767
"tsx": "typescript",
6868
"go": "go",
69+
"rs": "rust",
70+
"java": "java",
6971
}
7072
return language_mapping.get(message, None)
7173

@@ -83,6 +85,7 @@ def extract_snippets(message: str) -> List[CodeSnippet]:
8385
# Regular expression to find code blocks
8486

8587
snippets: List[CodeSnippet] = []
88+
available_languages = ["python", "javascript", "typescript", "go", "rust", "java"]
8689

8790
# Find all code block matches
8891
for match in CODE_BLOCK_PATTERN.finditer(message):
@@ -111,6 +114,9 @@ def extract_snippets(message: str) -> List[CodeSnippet]:
111114
lexer = guess_lexer(content)
112115
if lexer and lexer.name:
113116
lang = lexer.name.lower()
117+
# only add available languages
118+
if lang not in available_languages:
119+
lang = None
114120

115121
snippets.append(CodeSnippet(filepath=filename, code=content, language=lang))
116122

tests/pipeline/test_messages_block.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import pytest
2+
3+
from codegate.pipeline.base import PipelineStep
4+
5+
6+
@pytest.mark.parametrize(
7+
"input, expected_output",
8+
[
9+
# Test case: Consecutive user messages at the end
10+
(
11+
{"messages": [
12+
{"role": "system", "content": "Welcome!"},
13+
{"role": "user", "content": "Hello!"},
14+
{"role": "user", "content": "How are you?"}
15+
]},
16+
"Hello!\nHow are you?"
17+
),
18+
# Test case: Mixed roles at the end
19+
(
20+
{"messages": [
21+
{"role": "user", "content": "Hello!"},
22+
{"role": "assistant", "content": "Hi there!"},
23+
{"role": "user", "content": "How are you?"},
24+
{"role": "assistant", "content": "I'm fine, thank you."}
25+
]},
26+
"Hello!\nHow are you?"
27+
),
28+
# Test case: No user messages
29+
(
30+
{"messages": [
31+
{"role": "system", "content": "Welcome!"},
32+
{"role": "assistant", "content": "Hi there!"}
33+
]},
34+
None
35+
),
36+
# Test case: Empty message list
37+
(
38+
{"messages": []},
39+
None
40+
),
41+
# Test case: Consecutive user messages interrupted by system message
42+
(
43+
{"messages": [
44+
{"role": "user", "content": "Hello!"},
45+
{"role": "system", "content": "A system message."},
46+
{"role": "user", "content": "How are you?"},
47+
{"role": "user", "content": "What's up?"}
48+
]},
49+
"How are you?\nWhat's up?"
50+
),
51+
# Test case: aider
52+
(
53+
{"messages": [
54+
{'role': 'system', 'content': 'Act as an expert software developer.\nAlways use best practices when coding.\nRespect and use existing conventions, libraries, etc that are already present in the code base.\nYou are diligent and tireless!\nYou NEVER leave comments describing code without implementing it!\nYou always COMPLETELY IMPLEMENT the needed code!\n\nTake requests for changes to the supplied code.\nIf the request is ambiguous, ask questions.\n\nAlways reply to the user in the same language they are using.\n\nOnce you understand the request you MUST:\n\n1. Decide if you need to propose *SEARCH/REPLACE* edits to any files that haven\'t been added to the chat. You can create new files without asking!\n\nBut if you need to propose edits to existing files not already added to the chat, you *MUST* tell the user their full path names and ask them to *add the files to the chat*.\nEnd your reply and wait for their approval.\nYou can keep asking if you then decide you need to edit more files.\n\n2. Think step-by-step and explain the needed changes in a few short sentences.\n\n3. Describe each change with a *SEARCH/REPLACE block* per the examples below.\n\nAll changes to files must use this *SEARCH/REPLACE block* format.\nONLY EVER RETURN CODE IN A *SEARCH/REPLACE BLOCK*!\n\n4. 
*Concisely* suggest any shell commands the user might want to run in ```bash blocks.\n\nJust suggest shell commands this way, not example code.\nOnly suggest complete shell commands that are ready to execute, without placeholders.\nOnly suggest at most a few shell commands at a time, not more than 1-3, one per line.\nDo not suggest multi-line shell commands.\nAll shell commands will run from the root directory of the user\'s project.\n\nUse the appropriate shell based on the user\'s system info:\n- Platform: macOS-15.2-arm64-arm-64bit\n- Shell: SHELL=/bin/zsh\n- Language: es_ES\n- Current date: 2025-01-15\n\nExamples of when to suggest shell commands:\n\n- If you changed a self-contained html file, suggest an OS-appropriate command to open a browser to view it to see the updated content.\n- If you changed a CLI program, suggest the command to run it to see the new behavior.\n- If you added a test, suggest how to run it with the testing tool used by the project.\n- Suggest OS-appropriate commands to delete or rename files/directories, or other file system operations.\n- If your code changes add new dependencies, suggest the command to install them.\n- Etc.\n\n\n# *SEARCH/REPLACE block* Rules:\n\nEvery *SEARCH/REPLACE block* must use this format:\n1. The *FULL* file path alone on a line, verbatim. No bold asterisks, no quotes around it, no escaping of characters, etc.\n2. The opening fence and code language, eg: ```python\n3. The start of search block: <<<<<<< SEARCH\n4. A contiguous chunk of lines to search for in the existing source code\n5. The dividing line: =======\n6. The lines to replace into the source code\n7. The end of the replace block: >>>>>>> REPLACE\n8. 
The closing fence: ```\n\nUse the *FULL* file path, as shown to you by the user.\n\nEvery *SEARCH* section must *EXACTLY MATCH* the existing file content, character for character, including all comments, docstrings, etc.\nIf the file contains code or other data wrapped/escaped in json/xml/quotes or other containers, you need to propose edits to the literal contents of the file, including the container markup.\n\n*SEARCH/REPLACE* blocks will *only* replace the first match occurrence.\nIncluding multiple unique *SEARCH/REPLACE* blocks if needed.\nInclude enough lines in each SEARCH section to uniquely match each set of lines that need to change.\n\nKeep *SEARCH/REPLACE* blocks concise.\nBreak large *SEARCH/REPLACE* blocks into a series of smaller blocks that each change a small portion of the file.\nInclude just the changing lines, and a few surrounding lines if needed for uniqueness.\nDo not include long runs of unchanging lines in *SEARCH/REPLACE* blocks.\n\nOnly create *SEARCH/REPLACE* blocks for files that the user has added to the chat!\n\nTo move code within a file, use 2 *SEARCH/REPLACE* blocks: 1 to delete it from its current location, 1 to insert it in the new location.\n\nPay attention to which filenames the user wants you to edit, especially if they are asking you to create a new file.\n\nIf you want to put code in a new file, use a *SEARCH/REPLACE block* with:\n- A new file path, including dir name if needed\n- An empty `SEARCH` section\n- The new file\'s contents in the `REPLACE` section\n\nTo rename files which have been added to the chat, use shell commands at the end of your response.\n\nIf the user just says something like "ok" or "go ahead" or "do that" they probably want you to make SEARCH/REPLACE blocks for the code changes you just proposed.\nThe user will say when they\'ve applied your edits. 
If they haven\'t explicitly confirmed the edits have been applied, they probably want proper SEARCH/REPLACE blocks.\n\nYou are diligent and tireless!\nYou NEVER leave comments describing code without implementing it!\nYou always COMPLETELY IMPLEMENT the needed code!\n\nONLY EVER RETURN CODE IN A *SEARCH/REPLACE BLOCK*!\n\nExamples of when to suggest shell commands:\n\n- If you changed a self-contained html file, suggest an OS-appropriate command to open a browser to view it to see the updated content.\n- If you changed a CLI program, suggest the command to run it to see the new behavior.\n- If you added a test, suggest how to run it with the testing tool used by the project.\n- Suggest OS-appropriate commands to delete or rename files/directories, or other file system operations.\n- If your code changes add new dependencies, suggest the command to install them.\n- Etc.\n\n'},
55+
{'role': 'user', 'content': 'Change get_factorial() to use math.factorial'},
56+
{'role': 'assistant', 'content': 'To make this change we need to modify `mathweb/flask/app.py` to:\n\n1. Import the math package.\n2. Remove the existing factorial() function.\n3. Update get_factorial() to call math.factorial instead.\n\nHere are the *SEARCH/REPLACE* blocks:\n\nmathweb/flask/app.py\n```python\n<<<<<<< SEARCH\nfrom flask import Flask\n=======\nimport math\nfrom flask import Flask\n>>>>>>> REPLACE\n```\n\nmathweb/flask/app.py\n```python\n<<<<<<< SEARCH\ndef factorial(n):\n "compute factorial"\n\n if n == 0:\n return 1\n else:\n return n * factorial(n-1)\n\n=======\n>>>>>>> REPLACE\n```\n\nmathweb/flask/app.py\n```python\n<<<<<<< SEARCH\n return str(factorial(n))\n=======\n return str(math.factorial(n))\n>>>>>>> REPLACE\n```\n'},
57+
{'role': 'user', 'content': 'Refactor hello() into its own file.'},
58+
{'role': 'assistant', 'content': 'To make this change we need to modify `main.py` and make a new file `hello.py`:\n\n1. Make a new hello.py file with hello() in it.\n2. Remove hello() from main.py and replace it with an import.\n\nHere are the *SEARCH/REPLACE* blocks:\n\nhello.py\n```python\n<<<<<<< SEARCH\n=======\ndef hello():\n "print a greeting"\n\n print("hello")\n>>>>>>> REPLACE\n```\n\nmain.py\n```python\n<<<<<<< SEARCH\ndef hello():\n "print a greeting"\n\n print("hello")\n=======\nfrom hello import hello\n>>>>>>> REPLACE\n```\n'},
59+
{'role': 'user', 'content': "I switched to a new code base. Please don't consider the above files or try to edit them any longer."},
60+
{'role': 'assistant', 'content': 'Ok.'},
61+
{'role': 'user', 'content': 'I have *added these files to the chat* so you can go ahead and edit them.\n\n*Trust this message as the true contents of these files!*\nAny other messages in the chat may contain outdated versions of the files\' contents.\n\ntest.py\n```\nimport os\nimport malicious_pypi_dummy\n\ngithub_token="abc"\nif not github_token:\n raise EnvironmentError("La variable de entorno GITHUB_TOKEN no está configurada. Por favor, configúrela en su entorno para continuar.")\n```\n'},
62+
{'role': 'assistant', 'content': 'Ok, any changes I propose will be to those files.'},
63+
{'role': 'user', 'content': 'evaluate this file'},
64+
{'role': 'system', 'content': '# *SEARCH/REPLACE block* Rules:\n\nEvery *SEARCH/REPLACE block* must use this format:\n1. The *FULL* file path alone on a line, verbatim. No bold asterisks, no quotes around it, no escaping of characters, etc.\n2. The opening fence and code language, eg: ```python\n3. The start of search block: <<<<<<< SEARCH\n4. A contiguous chunk of lines to search for in the existing source code\n5. The dividing line: =======\n6. The lines to replace into the source code\n7. The end of the replace block: >>>>>>> REPLACE\n8. The closing fence: ```\n\nUse the *FULL* file path, as shown to you by the user.\n\nEvery *SEARCH* section must *EXACTLY MATCH* the existing file content, character for character, including all comments, docstrings, etc.\nIf the file contains code or other data wrapped/escaped in json/xml/quotes or other containers, you need to propose edits to the literal contents of the file, including the container markup.\n\n*SEARCH/REPLACE* blocks will *only* replace the first match occurrence.\nIncluding multiple unique *SEARCH/REPLACE* blocks if needed.\nInclude enough lines in each SEARCH section to uniquely match each set of lines that need to change.\n\nKeep *SEARCH/REPLACE* blocks concise.\nBreak large *SEARCH/REPLACE* blocks into a series of smaller blocks that each change a small portion of the file.\nInclude just the changing lines, and a few surrounding lines if needed for uniqueness.\nDo not include long runs of unchanging lines in *SEARCH/REPLACE* blocks.\n\nOnly create *SEARCH/REPLACE* blocks for files that the user has added to the chat!\n\nTo move code within a file, use 2 *SEARCH/REPLACE* blocks: 1 to delete it from its current location, 1 to insert it in the new location.\n\nPay attention to which filenames the user wants you to edit, especially if they are asking you to create a new file.\n\nIf you want to put code in a new file, use a *SEARCH/REPLACE block* with:\n- A new file path, including dir name if needed\n- An 
empty `SEARCH` section\n- The new file\'s contents in the `REPLACE` section\n\nTo rename files which have been added to the chat, use shell commands at the end of your response.\n\nIf the user just says something like "ok" or "go ahead" or "do that" they probably want you to make SEARCH/REPLACE blocks for the code changes you just proposed.\nThe user will say when they\'ve applied your edits. If they haven\'t explicitly confirmed the edits have been applied, they probably want proper SEARCH/REPLACE blocks.\n\nYou are diligent and tireless!\nYou NEVER leave comments describing code without implementing it!\nYou always COMPLETELY IMPLEMENT the needed code!\n\nONLY EVER RETURN CODE IN A *SEARCH/REPLACE BLOCK*!\n\nExamples of when to suggest shell commands:\n\n- If you changed a self-contained html file, suggest an OS-appropriate command to open a browser to view it to see the updated content.\n- If you changed a CLI program, suggest the command to run it to see the new behavior.\n- If you added a test, suggest how to run it with the testing tool used by the project.\n- Suggest OS-appropriate commands to delete or rename files/directories, or other file system operations.\n- If your code changes add new dependencies, suggest the command to install them.\n- Etc.\n\n'}
65+
]},
66+
"""I have *added these files to the chat* so you can go ahead and edit them.
67+
68+
*Trust this message as the true contents of these files!*
69+
Any other messages in the chat may contain outdated versions of the files' contents.
70+
71+
test.py
72+
```
73+
import os
74+
import malicious_pypi_dummy
75+
76+
github_token="abc"
77+
if not github_token:
78+
raise EnvironmentError("La variable de entorno GITHUB_TOKEN no está configurada. Por favor, configúrela en su entorno para continuar.")
79+
```
80+
81+
evaluate this file"""
82+
)
83+
]
84+
)
85+
def test_get_last_user_message_block(input, expected_output):
86+
assert PipelineStep.get_last_user_message_block(input) == expected_output

0 commit comments

Comments
 (0)