sourcegraph
diff --git a/agents/claude_baseline_agent.py b/agents/claude_baseline_agent.py
Lines changed: 31 additions & 31 deletions
diff --git a/agents/harnesses/base.py b/agents/harnesses/base.py
Lines changed: 3 additions & 3 deletions
diff --git a/benchmarks/ccb_build/bustub-hyperloglog-impl-001/environment/Dockerfile.sg_only b/benchmarks/ccb_build/bustub-hyperloglog-impl-001/environment/Dockerfile.sg_only
Lines changed: 2 additions & 2 deletions
diff --git a/benchmarks/ccb_build/bustub-hyperloglog-impl-001/instruction_mcp.md b/benchmarks/ccb_build/bustub-hyperloglog-impl-001/instruction_mcp.md
Lines changed: 4 additions & 4 deletions
diff --git a/benchmarks/ccb_build/camel-fix-protocol-feat-001/environment/Dockerfile.sg_only b/benchmarks/ccb_build/camel-fix-protocol-feat-001/environment/Dockerfile.sg_only
Lines changed: 2 additions & 2 deletions
diff --git a/benchmarks/ccb_build/camel-fix-protocol-feat-001/instruction_mcp.md b/benchmarks/ccb_build/camel-fix-protocol-feat-001/instruction_mcp.md
Lines changed: 4 additions & 4 deletions
diff --git a/benchmarks/ccb_build/cgen-deps-install-001/environment/Dockerfile.sg_only b/benchmarks/ccb_build/cgen-deps-install-001/environment/Dockerfile.sg_only
Lines changed: 2 additions & 2 deletions
diff --git a/benchmarks/ccb_build/cgen-deps-install-001/instruction_mcp.md b/benchmarks/ccb_build/cgen-deps-install-001/instruction_mcp.md
Lines changed: 4 additions & 4 deletions
diff --git a/benchmarks/ccb_build/codecoverage-deps-install-001/environment/Dockerfile.sg_only b/benchmarks/ccb_build/codecoverage-deps-install-001/environment/Dockerfile.sg_only
Lines changed: 2 additions & 2 deletions
@@ -354,9 +354,9 @@ def _get_repo_display(self) -> str:
 
         Resolution order:
         1. SOURCEGRAPH_REPO_NAME env var (explicit override, highest priority)
-        2. LOCOBENCH_PROJECT_ID -> sg-benchmarks/locobench-{prefix}
+        2. LOCOBENCH_PROJECT_ID -> sg-evals/locobench-{prefix}
            (checked in host env AND _container_env_cache populated by setup())
-        3. SWEBENCH_REPO_COMMIT -> sg-benchmarks/{repo_info}
+        3. SWEBENCH_REPO_COMMIT -> sg-evals/{repo_info}
            (checked in host env AND _container_env_cache populated by setup())
         4. Fallback: "the codebase"
         """
@@ -371,19 +371,19 @@ def _get_repo_display(self) -> str:
 
         locobench_prefix = os.environ.get("LOCOBENCH_PROJECT_ID", "") or cache.get("LOCOBENCH_PROJECT_ID", "")
         if locobench_prefix:
-            return f"sg-benchmarks/locobench-{locobench_prefix}"
+            return f"sg-evals/locobench-{locobench_prefix}"
 
         repo_info = os.environ.get("SWEBENCH_REPO_COMMIT", "") or cache.get("SWEBENCH_REPO_COMMIT", "")
         if repo_info:
-            return f"sg-benchmarks/{repo_info}"
+            return f"sg-evals/{repo_info}"
 
         return "the codebase"
 
     def _get_repo_list(self) -> list:
-        """Return list of sg-benchmarks repo names from SOURCEGRAPH_REPOS env var.
+        """Return list of sg-evals repo names from SOURCEGRAPH_REPOS env var.
 
         Multi-repo MCP-unique tasks set SOURCEGRAPH_REPOS as a comma-separated
-        list of sg-benchmarks mirror names (e.g. "sg-benchmarks/grafana,sg-benchmarks/grafana-loki").
+        list of sg-evals mirror names (e.g. "sg-evals/grafana,sg-evals/grafana-loki").
 
         Resolution order:
         1. Host env var SOURCEGRAPH_REPOS (set by config script)
@@ -437,9 +437,9 @@ def _rewrite_repo_references(text: str, sg_display: str) -> str:
           - **Repository**: org/repo (lang, ~NLOC)
           - **Repository:** org/repo
           - **Repo:** `org/repo`
-        to reference the sg-benchmarks mirror, keeping the original as context.
+        to reference the sg-evals mirror, keeping the original as context.
         """
-        if not sg_display or not sg_display.startswith("sg-benchmarks/"):
+        if not sg_display or not sg_display.startswith("sg-evals/"):
             return text
 
         sg_full = f"github.com/{sg_display}"
@@ -614,12 +614,12 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
                 branch_instructions = (
                     f"\n**Branch Search Instructions**\n\n"
                     f"IMPORTANT: You must search the `{scip_branch}` branch for all "
-                    f"repositories in `github.com/sg-benchmarks/`.\n\n"
+                    f"repositories in `github.com/sg-evals/`.\n\n"
                     f"When using search and file tools, always specify the "
                     f"`{scip_branch}` branch:\n\n"
                     f"- **keyword_search / nls_search:** Include "
                     f"`rev:{scip_branch}` in your query alongside the repo filter\n"
-                    f'  Example: `repo:^github\\.com/sg-benchmarks/REPO$ '
+                    f'  Example: `repo:^github\\.com/sg-evals/REPO$ '
                     f"rev:{scip_branch} YOUR_SEARCH_TERMS`\n"
                     f"- **read_file / list_files:** Set the `revision` parameter "
                     f'to `"{scip_branch}"`\n'
@@ -664,7 +664,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
                 repo_scope=repo_scope, workflow_tail=workflow_tail
             )
             # Rewrite upstream repo references in instruction body to point
-            # at the sg-benchmarks mirror so the agent sees consistent names.
+            # at the sg-evals mirror so the agent sees consistent names.
             instruction = self._rewrite_repo_references(instruction, repo_display)
             instruction = self._inject_repo_context(
                 instruction, repo_display, self._get_repo_list()
@@ -783,20 +783,20 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
 - ✅ CORRECT: "In {repo_display}, where is [code]?"
 - ✅ CORRECT: "Search {repo_display} for [query]"
 - ❌ WRONG: "In the codebase, where..." (too vague, might search wrong repo)
-- ❌ WRONG: "In navidrome, where..." (searches original, not sg-benchmarks mirror)
+- ❌ WRONG: "In navidrome, where..." (searches original, not sg-evals mirror)
 
 IMPORTANT - SG-BENCHMARKS ORG:
-The Deep Search MCP is configured to search in the sg-benchmarks GitHub organization.
+The Deep Search MCP is configured to search in the sg-evals GitHub organization.
 This organization contains mirrors of all benchmark repositories with HEAD pinned to match your local working copy's commit.
-Do NOT search the original repositories - use sg-benchmarks which has the correct indexed commit.
+Do NOT search the original repositories - use sg-evals which has the correct indexed commit.
 
 Workflow requirement:
 1) Run Deep Search MCP to find relevant code and understand relationships
    ALWAYS include repository reference: "In {repo_display}, [your query]"
 2) Open only the relevant files/regions needed to implement the fix
 3) If Deep Search returns no results, broaden the search query before opening more files
 
-Deep Search is configured for sg-benchmarks org with the correct commit, so results should match your local working copy.
+Deep Search is configured for sg-evals org with the correct commit, so results should match your local working copy.
 
 IMPORTANT: If your first search returns empty results, the repository name may differ
 from what you expect. Use `mcp__sourcegraph__sg_list_repos` (if available) to discover the correct repo name
@@ -841,10 +841,10 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
         - ✅ CORRECT: "In {repo_display}, where is [code]?"
         - ✅ CORRECT: "Search {repo_display} for [query]"
         - ❌ WRONG: "In the codebase, where..." (too vague, might search wrong repo)
-        - ❌ WRONG: "In navidrome, where..." (searches original, not sg-benchmarks mirror)
+        - ❌ WRONG: "In navidrome, where..." (searches original, not sg-evals mirror)
 
         IMPORTANT - SG-BENCHMARKS ORG:
-        The Deep Search MCP is configured to search in the sg-benchmarks GitHub organization.
+        The Deep Search MCP is configured to search in the sg-evals GitHub organization.
         This organization contains mirrors of all benchmark repositories with HEAD pinned to match your local working copy's commit.
         Deep Search results should now match your local working copy without version mismatches.
 
@@ -870,7 +870,7 @@ def create_run_agent_commands(self, instruction: str) -> list[ExecInput]:
         - Simple pattern/location verification → Local search
 
         This hybrid approach gives you the semantic understanding of Deep Search plus the speed of local tools.
-        With sg-benchmarks org configured, Deep Search results should accurately reflect your working copy.
+        With sg-evals org configured, Deep Search results should accurately reflect your working copy.
 
         IMPORTANT: If your first search returns empty results, the repository name may differ
         from what you expect. Use `mcp__sourcegraph__sg_list_repos` (if available) to discover the correct repo name
@@ -1718,19 +1718,19 @@ async def _setup_deepsearch_mcp(self, environment: BaseEnvironment) -> None:
         repo_info = os.environ.get("SWEBENCH_REPO_COMMIT", "")
         repo_name = ""
         commit = ""
-        sg_benchmarks_org = "sg-benchmarks"
+        sg_benchmarks_org = "sg-evals"
         if repo_info and "--" in repo_info:
             repo_name, commit = repo_info.split("--", 1)
-            logger.info(f"BaselineClaudeCodeAgent: Deep Search MCP will use sg-benchmarks repo={repo_name}, commit={commit}")
+            logger.info(f"BaselineClaudeCodeAgent: Deep Search MCP will use sg-evals repo={repo_name}, commit={commit}")
 
-        # Deep Search MCP config - add sg-benchmarks org and repo info if available
+        # Deep Search MCP config - add sg-evals org and repo info if available
         deepsearch_config = {
             "type": "http",
             "url": deepsearch_url,
             "headers": {"Authorization": f"token {deepsearch_token}"},
         }
 
-        # Add sg-benchmarks org and repo hint to the config if we have repo info
+        # Add sg-evals org and repo hint to the config if we have repo info
         if repo_name:
             deepsearch_config["org"] = sg_benchmarks_org
             deepsearch_config["repo"] = repo_name
@@ -1861,19 +1861,19 @@ async def _setup_deepsearch_hybrid_mcp(self, environment: BaseEnvironment) -> No
         repo_info = os.environ.get("SWEBENCH_REPO_COMMIT", "")
         repo_name = ""
         commit = ""
-        sg_benchmarks_org = "sg-benchmarks"
+        sg_benchmarks_org = "sg-evals"
         if repo_info and "--" in repo_info:
             repo_name, commit = repo_info.split("--", 1)
-            logger.info(f"BaselineClaudeCodeAgent: Hybrid Deep Search MCP will use sg-benchmarks repo={repo_name}, commit={commit}")
+            logger.info(f"BaselineClaudeCodeAgent: Hybrid Deep Search MCP will use sg-evals repo={repo_name}, commit={commit}")
 
-        # Deep Search MCP config (same as deepsearch mode) - add sg-benchmarks org and repo info if available
+        # Deep Search MCP config (same as deepsearch mode) - add sg-evals org and repo info if available
         deepsearch_config = {
             "type": "http",
             "url": deepsearch_url,
             "headers": {"Authorization": f"token {deepsearch_token}"},
         }
 
-        # Add sg-benchmarks org and repo hint to the config if we have repo info
+        # Add sg-evals org and repo hint to the config if we have repo info
         if repo_name:
             deepsearch_config["org"] = sg_benchmarks_org
             deepsearch_config["repo"] = repo_name
@@ -1896,7 +1896,7 @@ async def _setup_deepsearch_hybrid_mcp(self, environment: BaseEnvironment) -> No
         await environment.upload_file(
             source_path=mcp_config_path, target_path="/logs/agent/sessions/.mcp.json"
         )
-        logger.info(f"BaselineClaudeCodeAgent: Hybrid Deep Search MCP configured at /logs/agent/sessions/ ({deepsearch_url}) with sg-benchmarks org")
+        logger.info(f"BaselineClaudeCodeAgent: Hybrid Deep Search MCP configured at /logs/agent/sessions/ ({deepsearch_url}) with sg-evals org")
 
         # Get repo display name for CLAUDE.md
         repo_display = self._get_repo_display()
@@ -1925,11 +1925,11 @@ async def _setup_deepsearch_hybrid_mcp(self, environment: BaseEnvironment) -> No
         - ✅ CORRECT: "In {repo_display}, where is [code]?"
         - ✅ CORRECT: "Search {repo_display} for [query]"
         - ❌ WRONG: "In the codebase, where..." (too vague)
-        - ❌ WRONG: "In navidrome, where..." (searches original, not sg-benchmarks mirror)
+        - ❌ WRONG: "In navidrome, where..." (searches original, not sg-evals mirror)
 
         ## SG-BENCHMARKS ORG - COMMIT-MATCHED SEARCH
 
-        🎯 **Deep Search is configured to search within the **sg-benchmarks** GitHub organization.**
+        🎯 **Deep Search is configured to search within the **sg-evals** GitHub organization.**
 
         This organization contains mirrors of all benchmark repositories with:
         - HEAD pinned to the exact same commit as your local working copy
@@ -1940,7 +1940,7 @@ async def _setup_deepsearch_hybrid_mcp(self, environment: BaseEnvironment) -> No
 
         Use this decision logic to pick the right tool:
 
-        ### When to Use Deep Search MCP First (sg-benchmarks org):
+        ### When to Use Deep Search MCP First (sg-evals org):
         1. **Bug localization** - "In {repo_display}, where in the code does [error/behavior] occur?"
         2. **Error path discovery** - "In {repo_display}, what code handles [specific error condition]?"
         3. **Data flow tracing** - "In {repo_display}, how does data flow from [source] to [destination]?"
@@ -2052,7 +2052,7 @@ async def _setup_deepsearch_hybrid_mcp(self, environment: BaseEnvironment) -> No
 
         ## Why This Matters
 
-        With sg-benchmarks org correctly configured:
+        With sg-evals org correctly configured:
         - Deep Search results are **accurate** (same commit as your code)
         - Deep Search understands **relationships** (not just text matching)
         - You have **full tool access** (hybrid: MCP + local tools)
 
@@ -107,7 +107,7 @@ def _prepare_instruction(self, instruction: str) -> str:
             parts.append(self.SG_TOOL_REFERENCE)
         elif mcp_type == "deepsearch":
             parts.append("## Deep Search Guidance")
-            parts.append(f"Search `sg-benchmarks/{repo}` with Deep Search when you need cross-file understanding")
+            parts.append(f"Search `sg-evals/{repo}` with Deep Search when you need cross-file understanding")
         elif mcp_type == "deepsearch_hybrid":
             parts.append("## Deep Search Hybrid Guidance")
             parts.append("Use Deep Search for semantic exploration and local tools for verification.")
@@ -124,11 +124,11 @@ def _get_repo_display(self) -> str:
         cache = self._container_env_cache
         locobench = cache.get("LOCOBENCH_PROJECT_ID") or os.environ.get("LOCOBENCH_PROJECT_ID", "")
         if locobench:
-            return f"sg-benchmarks/locobench-{locobench}"
+            return f"sg-evals/locobench-{locobench}"
 
         swebench = cache.get("SWEBENCH_REPO_COMMIT") or os.environ.get("SWEBENCH_REPO_COMMIT", "")
         if swebench:
-            return f"sg-benchmarks/{swebench}"
+            return f"sg-evals/{swebench}"
 
         return "the codebase"
 
 
@@ -4,7 +4,7 @@
 
 FROM ghcr.io/theagentcompany/sde-implement-hyperloglog-image:1.0.0
 
-ENV SOURCEGRAPH_REPO_NAME=sg-benchmarks/bustub--d5f79431
+ENV SOURCEGRAPH_REPO_NAME=sg-evals/bustub--d5f79431
 
 # TAC environment variables (needed by verifier)
 ENV TAC_SERVER_HOSTNAME=localhost
@@ -31,6 +31,6 @@ RUN git init 2>/dev/null || (git config --global init.defaultBranch main && git
 RUN touch /tmp/.sg_only_mode
 
 # Clone manifest for sgonly_verifier_wrapper.sh to restore repo at verify time
-RUN echo '{"repos":[{"mirror":"sg-benchmarks/bustub--d5f79431","dest":"/workspace"}]}' > /tmp/.sg_only_clone_manifest.json
+RUN echo '{"repos":[{"mirror":"sg-evals/bustub--d5f79431","dest":"/workspace"}]}' > /tmp/.sg_only_clone_manifest.json
 
 ENTRYPOINT []
@@ -2,9 +2,9 @@
 
 **Local source files are not present.** Your workspace does not contain source code. You **MUST** use Sourcegraph MCP tools to discover, read, and understand code before making any changes.
 
-**Target Repository:** `github.com/sg-benchmarks/bustub--d5f79431`
-- Use `repo:^github.com/sg-benchmarks/bustub--d5f79431$` filter in keyword_search
-- Use `github.com/sg-benchmarks/bustub--d5f79431` as the `repo` parameter for go_to_definition/find_references/read_file
+**Target Repository:** `github.com/sg-evals/bustub--d5f79431`
+- Use `repo:^github.com/sg-evals/bustub--d5f79431$` filter in keyword_search
+- Use `github.com/sg-evals/bustub--d5f79431` as the `repo` parameter for go_to_definition/find_references/read_file
 
 
 ## Required Workflow
@@ -67,7 +67,7 @@ If MCP search returns no results:
 
 # Implement HyperLogLog Algorithm
 
-**Repository:** github.com/sg-benchmarks/bustub--d5f79431 (mirror of bustub) (TheAgentCompany GitLab)
+**Repository:** github.com/sg-evals/bustub--d5f79431 (mirror of bustub) (TheAgentCompany GitLab)
 **Difficulty:** HARD
 **Category:** ccb_tac
 **Task Type:** Algorithm Implementation
 
@@ -4,7 +4,7 @@
 
 FROM eclipse-temurin:17-jdk
 
-ENV SOURCEGRAPH_REPO_NAME=sg-benchmarks/camel--1006f047
+ENV SOURCEGRAPH_REPO_NAME=sg-evals/camel--1006f047
 
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -26,7 +26,7 @@ RUN git init && \
 RUN mkdir -p /logs/agent /logs/verifier
 
 # Clone manifest for verifier (clone-at-verify strategy)
-RUN echo '{"workdir":"/workspace","repos":[{"mirror":"sg-benchmarks/camel--1006f047","target_dir":"."}]}' > /tmp/.sg_only_clone_manifest.json
+RUN echo '{"workdir":"/workspace","repos":[{"mirror":"sg-evals/camel--1006f047","target_dir":"."}]}' > /tmp/.sg_only_clone_manifest.json
 
 # Mark sg_only mode
 RUN touch /tmp/.sg_only_mode
 
@@ -2,9 +2,9 @@
 
 **Local source files are not present.** Your workspace does not contain source code. You **MUST** use Sourcegraph MCP tools to discover, read, and understand code before making any changes.
 
-**Target Repository:** `github.com/sg-benchmarks/camel--1006f047`
-- Use `repo:^github.com/sg-benchmarks/camel--1006f047$` filter in keyword_search
-- Use `github.com/sg-benchmarks/camel--1006f047` as the `repo` parameter for go_to_definition/find_references/read_file
+**Target Repository:** `github.com/sg-evals/camel--1006f047`
+- Use `repo:^github.com/sg-evals/camel--1006f047$` filter in keyword_search
+- Use `github.com/sg-evals/camel--1006f047` as the `repo` parameter for go_to_definition/find_references/read_file
 
 
 ## Required Workflow
@@ -111,7 +111,7 @@ Study existing components like `camel-kafka`, `camel-netty`, or `camel-amqp` for
 
 ## Context
 
-- **Repository**: github.com/sg-benchmarks/camel--1006f047 (mirror of apache/camel) (Java, ~2M LOC)
+- **Repository**: github.com/sg-evals/camel--1006f047 (mirror of apache/camel) (Java, ~2M LOC)
 - **Category**: Feature Implementation
 - **Difficulty**: hard
 - **Subsystem Focus**: components/camel-fix/ (new module), components/pom.xml (registration)
 
@@ -4,7 +4,7 @@
 
 FROM ubuntu:22.04
 
-ENV SOURCEGRAPH_REPO_NAME=sg-benchmarks/cgen--dibench
+ENV SOURCEGRAPH_REPO_NAME=sg-evals/cgen--dibench
 
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -25,7 +25,7 @@ RUN git init && \
 RUN mkdir -p /logs/agent /logs/verifier
 
 # Clone manifest for verifier (clone-at-verify strategy)
-RUN echo '{"workdir":"/app/repo","repos":[{"mirror":"sg-benchmarks/cgen--dibench","target_dir":"."}]}' > /tmp/.sg_only_clone_manifest.json
+RUN echo '{"workdir":"/app/repo","repos":[{"mirror":"sg-evals/cgen--dibench","target_dir":"."}]}' > /tmp/.sg_only_clone_manifest.json
 
 # Mark sg_only mode
 RUN touch /tmp/.sg_only_mode
 
@@ -2,9 +2,9 @@
 
 **Local source files are not present.** Your workspace does not contain source code. You **MUST** use Sourcegraph MCP tools to discover, read, and understand code before making any changes.
 
-**Target Repository:** `github.com/sg-benchmarks/cgen--dibench`
-- Use `repo:^github.com/sg-benchmarks/cgen--dibench$` filter in keyword_search
-- Use `github.com/sg-benchmarks/cgen--dibench` as the `repo` parameter for go_to_definition/find_references/read_file
+**Target Repository:** `github.com/sg-evals/cgen--dibench`
+- Use `repo:^github.com/sg-evals/cgen--dibench$` filter in keyword_search
+- Use `github.com/sg-evals/cgen--dibench` as the `repo` parameter for go_to_definition/find_references/read_file
 
 
 ## Required Workflow
@@ -65,7 +65,7 @@ If MCP search returns no results:
 
 ---
 
-**Sourcegraph Repository:** `github.com/sg-benchmarks/cgen--dibench`
+**Sourcegraph Repository:** `github.com/sg-evals/cgen--dibench`
 
 # Dependency Inference Task
 
 
@@ -4,7 +4,7 @@
 
 FROM ubuntu:22.04
 
-ENV SOURCEGRAPH_REPO_NAME=sg-benchmarks/CodeCoverageSummary--dibench
+ENV SOURCEGRAPH_REPO_NAME=sg-evals/CodeCoverageSummary--dibench
 
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -25,7 +25,7 @@ RUN git init && \
 RUN mkdir -p /logs/agent /logs/verifier
 
 # Clone manifest for verifier (clone-at-verify strategy)
-RUN echo '{"workdir":"/app/repo","repos":[{"mirror":"sg-benchmarks/CodeCoverageSummary--dibench","target_dir":"."}]}' > /tmp/.sg_only_clone_manifest.json
+RUN echo '{"workdir":"/app/repo","repos":[{"mirror":"sg-evals/CodeCoverageSummary--dibench","target_dir":"."}]}' > /tmp/.sg_only_clone_manifest.json
 
 # Mark sg_only mode
 RUN touch /tmp/.sg_only_mode