Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

feat: remove duplicated alerts #989

Merged
merged 3 commits into from
Feb 12, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion signatures.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,6 @@
- Bitcoin SegWit: \b(bc1)[a-zA-HJ-NP-Z0-9]{39,59}\b
- Ethereum: \b0x[a-fA-F0-9]{40}\b
- Litecoin: \b(L|M)[a-km-zA-HJ-NP-Z1-9]{26,33}\b
- Dogecoin: \b(D|A)[a-km-zA-HJ-NP-Z1-9]{25,34}\b
- Ripple: \br[rK][a-zA-Z0-9]{25,35}\b
- Monero: \b4[0-9AB][1-9A-HJ-NP-Za-km-z]{93}\b
- Tron: \bT[a-zA-HJ-NP-Z0-9]{33}\b
Expand Down
44 changes: 40 additions & 4 deletions src/codegate/api/v1_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ async def _is_system_prompt(message: str) -> bool:
return False


async def parse_request(request_str: str) -> Tuple[Optional[List[str]], str]:
async def parse_request(request_str: str) -> Tuple[Optional[List[str]], str]: # noqa: C901
"""
Parse the request string from the pipeline and return the message and the model.
"""
Expand Down Expand Up @@ -105,7 +105,7 @@ async def parse_request(request_str: str) -> Tuple[Optional[List[str]], str]:
return messages, model


async def parse_output(output_str: str) -> Optional[str]:
async def parse_output(output_str: str) -> Optional[str]: # noqa: C901
"""
Parse the output string from the pipeline and return the message.
"""
Expand Down Expand Up @@ -392,7 +392,8 @@ async def match_conversations(
qa = _get_question_answer_from_partial(selected_partial_qa)
qa.question.message = parse_question_answer(qa.question.message)
questions_answers.append(qa)
alerts.extend(selected_partial_qa.alerts)
deduped_alerts = await remove_duplicate_alerts(selected_partial_qa.alerts)
alerts.extend(deduped_alerts)
token_usage_agg.add_model_token_usage(selected_partial_qa.model_token_usage)

# only add conversation if we have some answers
Expand Down Expand Up @@ -480,10 +481,11 @@ async def parse_get_alert_conversation(
The rows contain the raw request and output strings from the pipeline.
"""
_, map_q_id_to_conversation = await parse_messages_in_conversations(prompts_outputs)
dedup_alerts = await remove_duplicate_alerts(alerts)
async with asyncio.TaskGroup() as tg:
tasks = [
tg.create_task(parse_row_alert_conversation(row, map_q_id_to_conversation))
for row in alerts
for row in dedup_alerts
]
return [task.result() for task in tasks if task.result() is not None]

Expand All @@ -499,3 +501,37 @@ async def parse_workspace_token_usage(
for p_qa in partial_question_answers:
token_usage_agg.add_model_token_usage(p_qa.model_token_usage)
return token_usage_agg


async def remove_duplicate_alerts(alerts: List[v1_models.Alert]) -> List[v1_models.Alert]:
    """
    Collapse bursts of repeated "codegate-secrets" alerts into a single alert.

    Alerts are visited from newest to oldest. Alerts whose trigger type is not
    "codegate-secrets" are always kept. Two secrets alerts are considered
    duplicates when they share the same code snippet, trigger type, trigger
    category and trigger string (ignoring everything from the first "Context"
    onwards) and were raised less than 5 seconds apart. Of a run of such
    duplicates only the newest alert is kept.

    Returns the kept alerts ordered by timestamp, newest first.
    """
    unique_alerts = []
    # Last secrets alert visited for each dedup key. It is refreshed on every
    # visit (even when the alert is dropped) so that a chain of alerts, each less
    # than 5 seconds from the next, collapses into the newest one.
    last_seen = {}

    for alert in sorted(alerts, key=lambda x: x.timestamp, reverse=True):
        if alert.trigger_type != "codegate-secrets":
            unique_alerts.append(alert)
            continue

        # Only the part of the trigger string before "Context" identifies the
        # secret; what follows is dropped so it does not affect the comparison.
        trigger_string_content = alert.trigger_string.split("Context")[0]

        key = (
            alert.code_snippet,
            alert.trigger_type,
            alert.trigger_category,
            trigger_string_content,
        )

        previous_alert = last_seen.get(key)
        last_seen[key] = alert

        # `previous_alert` is newer than (or as old as) `alert` because of the
        # descending sort; a close match means `alert` is a repeat of it.
        if (
            previous_alert is not None
            and abs((previous_alert.timestamp - alert.timestamp).total_seconds()) < 5
        ):
            continue

        unique_alerts.append(alert)

    # Return the accumulated list, not the per-key map: the map holds no
    # non-secret alerts and at most one (the oldest) alert per key.
    return unique_alerts