Skip to content

Commit 0073dc9

Browse files
sjarmakclaude
andcommitted
fix: rewrite upstream repo references in MCP instruction body at runtime
The V5 preamble correctly references sg-benchmarks mirrors, but the instruction body still contained upstream repo names (e.g., apache/camel, django/django), creating conflicting signals for the agent.

Port the rewriting logic from generate_instruction_mcp.py into claude_baseline_agent.py so it runs at runtime:

- _rewrite_repo_references(): rewrites **Repository:** and **Repo:** patterns to point at the sg-benchmarks mirror with a "(mirror of ...)" annotation
- _inject_repo_context(): adds a **Sourcegraph Repository:** header for the ~57 tasks that lack any Repository line in their instruction

Both methods are no-ops when sg_display is empty or non-sg-benchmarks, preserving baseline behavior.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0843578 commit 0073dc9

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed

agents/claude_baseline_agent.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import json
1919
import logging
2020
import os
21+
import re
2122
from pathlib import Path
2223

2324
from harbor.agents.installed.claude_code import ClaudeCode
@@ -428,6 +429,74 @@ def _parse_sourcegraph_repos_from_dockerfile(self) -> str:
428429
continue
429430
return ""
430431

432+
@staticmethod
def _rewrite_repo_references(text: str, sg_display: str) -> str:
    """Replace upstream repo references in instruction body with SG mirror name.

    Rewrites labelled references such as:
      - **Repository**: org/repo (lang, ~NLOC)
      - **Repository:** org/repo
      - **Repo:** `org/repo`
    so they name the sg-benchmarks mirror, keeping the original slug as a
    "(mirror of ...)" annotation. Backticks around the slug are preserved.

    Args:
        text: Instruction body to rewrite.
        sg_display: Mirror name in "sg-benchmarks/<repo>" form.

    Returns:
        The rewritten text. ``text`` is returned unchanged when
        ``sg_display`` is empty or does not start with "sg-benchmarks/".
    """
    if not sg_display or not sg_display.startswith("sg-benchmarks/"):
        return text

    sg_full = f"github.com/{sg_display}"
    host_prefix = "github.com/"

    def _is_mirror(slug: str) -> bool:
        # A reference that already names an sg-benchmarks mirror must not be
        # wrapped a second time (keeps the rewrite idempotent).
        bare = re.sub(r'^https?://', '', slug)
        if bare.startswith(host_prefix):
            bare = bare[len(host_prefix):]
        return bare.startswith("sg-benchmarks/")

    def _replace(m: re.Match) -> str:
        prefix, tick, old_slug = m.group(1), m.group(2), m.group(3)
        if _is_mirror(old_slug):
            return m.group(0)
        return (
            f"{prefix}{tick}{sg_full}{tick}"
            f" (mirror of {tick}{old_slug}{tick})"
        )

    return re.sub(
        # Label: the colon is mandatory (inside or outside the bold markers)
        # so phrases like "**Repository structure**" are left alone, and only
        # horizontal whitespace may follow so a bare header line cannot
        # swallow the first word of the next paragraph.
        r'(\*\*Repo(?:sitory)?(?:\*\*)?:(?:\*\*)?[ \t]*)'
        # Optional opening backtick; \2 below requires the matching close.
        r'(`?)'
        # Slug: optional scheme plus any number of path segments, so
        # "github.com/org/repo" and full URLs are captured whole.
        r'((?:https?://)?[\w.-]+(?:/[\w.-]+)*)'
        r'\2',
        _replace,
        text,
    )
473+
474+
@staticmethod
def _inject_repo_context(text: str, sg_display: str, repo_list: list) -> str:
    """Inject a Sourcegraph repo context line if the body lacks a **Repository** line.

    Some tasks (~57) have no structured **Repository:** line, so the agent
    only sees the mirror name in the preamble header. This adds a visible
    context line at the top of the instruction body.

    Args:
        text: Instruction body.
        sg_display: Single repo display name, with or without a leading
            "github.com/". Used only when ``repo_list`` yields no names.
        repo_list: Repo names, each with or without a leading
            "github.com/". Empty/None entries are ignored.

    Returns:
        ``text`` prefixed with a "**Sourcegraph Repositories:**" or
        "**Sourcegraph Repository:**" line followed by a blank line, or
        ``text`` unchanged when it already carries a repo label or when no
        repo name is available (``sg_display`` empty or "the codebase").
    """
    # The trailing \b keeps unrelated bold labels such as **Report** or
    # **Reporter:** from suppressing the injection; the optional
    # "Sourcegraph" prefix recognises a header added by an earlier call so
    # the function never injects twice.
    if re.search(r'\*\*(?:Sourcegraph[ \t]+)?Repo(?:s|sitory|sitories)?\b', text):
        return text

    host_prefix = "github.com/"

    def _bare(name: str) -> str:
        # Normalise to "org/repo" so the host prefix is emitted exactly once.
        return name[len(host_prefix):] if name.startswith(host_prefix) else name

    sg_names = [f"`{host_prefix}{_bare(r)}`" for r in (repo_list or []) if r]

    if sg_names:
        context = f"**Sourcegraph Repositories:** {', '.join(sg_names)}\n\n"
    elif sg_display and sg_display != "the codebase":
        context = f"**Sourcegraph Repository:** `{host_prefix}{_bare(sg_display)}`\n\n"
    else:
        return text

    return context + text
431500

432501
def _get_session_dir(self):
433502
"""Override Harbor's _get_session_dir to handle Claude Code subagent sessions.
@@ -571,6 +640,12 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
571640
mcp_preamble = V5_PREAMBLE_TEMPLATE.format(
572641
repo_scope=repo_scope, workflow_tail=workflow_tail
573642
)
643+
# Rewrite upstream repo references in instruction body to point
644+
# at the sg-benchmarks mirror so the agent sees consistent names.
645+
instruction = self._rewrite_repo_references(instruction, repo_display)
646+
instruction = self._inject_repo_context(
647+
instruction, repo_display, self._get_repo_list()
648+
)
574649
instruction = mcp_preamble + instruction
575650

576651
elif mcp_type == "sourcegraph_isolated":

0 commit comments

Comments
 (0)