|
18 | 18 | import json |
19 | 19 | import logging |
20 | 20 | import os |
| 21 | +import re |
21 | 22 | from pathlib import Path |
22 | 23 |
|
23 | 24 | from harbor.agents.installed.claude_code import ClaudeCode |
@@ -428,6 +429,74 @@ def _parse_sourcegraph_repos_from_dockerfile(self) -> str: |
428 | 429 | continue |
429 | 430 | return "" |
430 | 431 |
|
@staticmethod
def _rewrite_repo_references(text: str, sg_display: str) -> str:
    """Point structured repo labels in *text* at the sg-benchmarks mirror.

    Rewrites lines such as:
      - **Repository**: org/repo (lang, ~NLOC)
      - **Repository:** org/repo
      - **Repository:** `org/repo`
      - **Repo:** `org/repo`
    so they name the mirror first and keep the upstream slug as context,
    e.g. ``github.com/sg-benchmarks/x (mirror of org/repo)``.

    Args:
        text: Instruction body to rewrite.
        sg_display: Mirror name, either ``sg-benchmarks/<name>`` or
            ``github.com/sg-benchmarks/<name>``. Any other value leaves
            *text* untouched.

    Returns:
        The rewritten text, or *text* unchanged when there is nothing to do.
    """
    if not text or not sg_display:
        return text

    # Accept the mirror with or without the host prefix, the same way
    # _inject_repo_context normalises its display name.
    mirror = sg_display
    if mirror.startswith("github.com/"):
        mirror = mirror[len("github.com/"):]
    if not mirror.startswith("sg-benchmarks/"):
        return text

    sg_full = f"github.com/{mirror}"

    # Optional scheme plus any number of path segments, so that
    # "github.com/org/repo" and "https://github.com/org/repo" are captured
    # whole instead of being cut after the first slash or at the colon.
    slug = r'(?:https?://)?[\w.-]+(?:/[\w.-]+)*'

    def _is_mirror(candidate: str) -> bool:
        # True when the reference already names the mirror; rewriting it
        # again would nest "(mirror of ...)" inside itself.
        bare = re.sub(r'^https?://', '', candidate)
        return bare in (mirror, sg_full)

    def _replace_repository(m: re.Match) -> str:
        prefix, tick, old_slug, suffix = m.group(1, 2, 3, 4)
        if _is_mirror(old_slug):
            return m.group(0)
        # Without a colon and without a slash this is most likely a bold
        # heading ("**Repository Structure**"), not a repo label.
        if ":" not in prefix and "/" not in old_slug:
            return m.group(0)
        return (
            f"{prefix}{tick}{sg_full}{tick} "
            f"(mirror of {tick}{old_slug}{tick}){suffix}"
        )

    # Only horizontal whitespace is allowed between the label and the slug:
    # \s would cross a line break and swallow the first word of the next
    # paragraph when the label stands alone on its line.
    text = re.sub(
        r'(\*\*Repository\*?\*?:?\*?\*?[ \t]*)'
        r'(`?)'
        rf'({slug})'
        r'\2'  # closing backtick must mirror the opening one
        r'((?:[ \t]+\([^)]*\))?)',
        _replace_repository,
        text,
    )

    def _replace_repo_backtick(m: re.Match) -> str:
        prefix, old_slug = m.group(1, 2)
        if _is_mirror(old_slug):
            return m.group(0)
        return f"{prefix}`{sg_full}` (mirror of `{old_slug}`)"

    text = re.sub(
        r'(\*\*Repo:\*\*[ \t]*)'
        rf'`({slug})`',
        _replace_repo_backtick,
        text,
    )

    return text
| 473 | + |
@staticmethod
def _inject_repo_context(text: str, sg_display: str, repo_list: list) -> str:
    """Prepend a Sourcegraph repo context line when *text* has no repo label.

    Some instruction bodies carry no structured ``**Repository:**`` /
    ``**Repo:**`` line, so the mirror name would only appear in the preamble
    header. In that case a visible context line is added at the top of the
    body.

    Args:
        text: Instruction body.
        sg_display: Display name of the single mirror repo, with or without a
            ``github.com/`` prefix. Ignored when empty or equal to the
            placeholder ``"the codebase"``.
        repo_list: Repo names (with or without ``github.com/``). When it
            yields at least one usable name it takes precedence over
            *sg_display*. May be empty or ``None``.

    Returns:
        *text* with the context line prepended, or *text* unchanged when a
        repo label is already present, the context line is already there, or
        no repo name is available.
    """
    # The word boundary keeps unrelated bold words that merely start with
    # "Repo" ("**Report**", "**Reported by:**") from suppressing injection.
    if re.search(r'\*\*Repo(?:sitor(?:y|ies))?\b', text):
        return text

    host = "github.com/"

    def _bare(name: str) -> str:
        # Drop the host so it is not doubled when re-added below.
        return name[len(host):] if name.startswith(host) else name

    # Normalise the list: skip blank / non-string entries and drop
    # duplicates while preserving order.
    names = []
    for entry in repo_list or ():
        if not isinstance(entry, str):
            continue
        bare = _bare(entry.strip())
        if bare and bare not in names:
            names.append(bare)

    if names:
        quoted = ", ".join(f"`{host}{name}`" for name in names)
        context = f"**Sourcegraph Repositories:** {quoted}\n\n"
    elif sg_display and sg_display != "the codebase":
        context = f"**Sourcegraph Repository:** `{host}{_bare(sg_display)}`\n\n"
    else:
        return text

    # The injected label starts with "**Sourcegraph", which the guard above
    # does not match, so check explicitly to stay idempotent.
    if context.rstrip("\n") in text:
        return text

    return context + text
431 | 500 |
|
432 | 501 | def _get_session_dir(self): |
433 | 502 | """Override Harbor's _get_session_dir to handle Claude Code subagent sessions. |
@@ -571,6 +640,12 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]: |
571 | 640 | mcp_preamble = V5_PREAMBLE_TEMPLATE.format( |
572 | 641 | repo_scope=repo_scope, workflow_tail=workflow_tail |
573 | 642 | ) |
| 643 | + # Rewrite upstream repo references in instruction body to point |
| 644 | + # at the sg-benchmarks mirror so the agent sees consistent names. |
| 645 | + instruction = self._rewrite_repo_references(instruction, repo_display) |
| 646 | + instruction = self._inject_repo_context( |
| 647 | + instruction, repo_display, self._get_repo_list() |
| 648 | + ) |
574 | 649 | instruction = mcp_preamble + instruction |
575 | 650 |
|
576 | 651 | elif mcp_type == "sourcegraph_isolated": |
|
0 commit comments