Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion benchmarking/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ __pycache__/
outputs/
*.sif
*agent_systems/
agent_systems/
agent_systems/
*.pyc
1 change: 1 addition & 0 deletions benchmarking/agents/AgentSystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def get_full_prompt(self) -> str:
full_prompt += f"\n- Command: `{name}`"
full_prompt += f"\n - Description: {command.description}"
full_prompt += f"\n - Target Agent: {command.target_agent}"
full_prompt += "YOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED."
Copy link

Copilot AI Jul 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line is inside the loop that appends each command, causing it to repeat multiple times. Move it outside the loop so it appears just once.

Suggested change
full_prompt += "YOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED."
full_prompt += "YOU MUST USE THESE EXACT COMMANDS TO DELEGATE TASKS. NO OTHER FORMATTING OR COMMANDS ARE ALLOWED."

Copilot uses AI. Check for mistakes.
return full_prompt

class AgentSystem:
Expand Down
43 changes: 43 additions & 0 deletions benchmarking/agents/integration_system.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"agents": {
"master_agent": {
"prompt": "You are the master agent. Analyze every user request and delegate the task to the appropriate expert: the general coder for standard single-cell analysis or the integration expert for batch correction and data integration tasks. Respond ONLY with a delegation command.",
"neighbors": {
"delegate_to_general": {
"target_agent": "general_coder",
"description": "Delegate for general single-cell tasks like QC, normalization, and plotting."
},
"delegate_to_integration": {
"target_agent": "integration_expert",
"description": "Delegate for complex data integration and batch correction using scvi-tools."
}
}
},
"general_coder": {
"prompt": "You are the *general scRNA-seq coder*. You handle standard single-cell analysis tasks like data loading, QC, filtering, normalization, and basic plotting using scanpy. You are not an expert in data integration.\n\nExample of a task you would perform:\n```python\nimport scanpy as sc\n\n# Assume 'adata' is a loaded AnnData object\n# Basic QC and filtering\nsc.pp.filter_cells(adata, min_genes=200)\nsc.pp.filter_genes(adata, min_cells=3)\nadata.var['mt'] = adata.var_names.str.startswith('MT-')\nsc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)\n\n# Normalize and find highly variable genes\nsc.pp.normalize_total(adata, target_sum=1e4)\nsc.pp.log1p(adata)\nsc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)\n\n# Run PCA\nsc.tl.pca(adata, svd_solver='arpack')\n\nprint('Standard analysis complete. PCA is in adata.obsm[\"X_pca\"].')\n```",
"neighbors": {
"delegate_to_master": {
"target_agent": "master_agent",
"description": "Return to the master agent if you are not the correct expert."
},
"delegate_to_integration": {
"target_agent": "integration_expert",
"description": "Delegate to this expert for complex data integration and batch correction."
}
}
},
"integration_expert": {
"prompt": "You are the *integration expert*. You specialize in combining multiple single-cell datasets and correcting for batch effects using scvi-tools.\n\nExample of a task you would perform:\n```python\nimport scvi\nimport scanpy as sc\n\n# Assume 'adata' is loaded and preprocessed with a 'batch' column\n# Find highly variable genes across batches for integration\nsc.pp.highly_variable_genes(\n adata,\n n_top_genes=2000,\n subset=True,\n layer='counts',\n flavor='seurat_v3',\n batch_key='batch'\n)\n\n# Set up the AnnData object for the scVI model\nscvi.model.SCVI.setup_anndata(adata, layer='counts', batch_key='batch')\n\n# Create and train the scVI model\nmodel = scvi.model.SCVI(adata, n_layers=2, n_latent=30)\nmodel.train()\n\n# Store the integrated latent representation in the AnnData object\nadata.obsm['X_scVI'] = model.get_latent_representation()\n\nprint('Integration complete. Integrated embedding is in adata.obsm[\"X_scVI\"].')\n``` you remeber to wrap your code in triple backticks and python",
Copy link

Copilot AI Jul 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: remeber should be remember.

Suggested change
"prompt": "You are the *integration expert*. You specialize in combining multiple single-cell datasets and correcting for batch effects using scvi-tools.\n\nExample of a task you would perform:\n```python\nimport scvi\nimport scanpy as sc\n\n# Assume 'adata' is loaded and preprocessed with a 'batch' column\n# Find highly variable genes across batches for integration\nsc.pp.highly_variable_genes(\n adata,\n n_top_genes=2000,\n subset=True,\n layer='counts',\n flavor='seurat_v3',\n batch_key='batch'\n)\n\n# Set up the AnnData object for the scVI model\nscvi.model.SCVI.setup_anndata(adata, layer='counts', batch_key='batch')\n\n# Create and train the scVI model\nmodel = scvi.model.SCVI(adata, n_layers=2, n_latent=30)\nmodel.train()\n\n# Store the integrated latent representation in the AnnData object\nadata.obsm['X_scVI'] = model.get_latent_representation()\n\nprint('Integration complete. Integrated embedding is in adata.obsm[\"X_scVI\"].')\n``` you remeber to wrap your code in triple backticks and python",
"prompt": "You are the *integration expert*. You specialize in combining multiple single-cell datasets and correcting for batch effects using scvi-tools.\n\nExample of a task you would perform:\n```python\nimport scvi\nimport scanpy as sc\n\n# Assume 'adata' is loaded and preprocessed with a 'batch' column\n# Find highly variable genes across batches for integration\nsc.pp.highly_variable_genes(\n adata,\n n_top_genes=2000,\n subset=True,\n layer='counts',\n flavor='seurat_v3',\n batch_key='batch'\n)\n\n# Set up the AnnData object for the scVI model\nscvi.model.SCVI.setup_anndata(adata, layer='counts', batch_key='batch')\n\n# Create and train the scVI model\nmodel = scvi.model.SCVI(adata, n_layers=2, n_latent=30)\nmodel.train()\n\n# Store the integrated latent representation in the AnnData object\nadata.obsm['X_scVI'] = model.get_latent_representation()\n\nprint('Integration complete. Integrated embedding is in adata.obsm[\"X_scVI\"].')\n``` you remember to wrap your code in triple backticks and python",

Copilot uses AI. Check for mistakes.
"neighbors": {
"delegate_to_master": {
"target_agent": "master_agent",
"description": "Return to the master agent if you are not the correct expert. Only do this if you are absolutely sure you cannot handle the task. It costs money to delegate."
},
"delegate_to_general": {
"target_agent": "general_coder",
"description": "Delegate to this expert for general single-cell analysis tasks."
}
}
}
}
}
4 changes: 2 additions & 2 deletions benchmarking/auto_metrics/AutoMetric.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
class AutoMetric(ABC):
"""
Abstract base class for a metric to be applied to an AnnData object.
"""
"""
@abstractmethod
def metric(self, adata) -> dict:
"""
Run the metric and return a dictionary of results.
"""
pass

def run(self, adata):
"""
Handles execution + JSON serialization.
Expand Down
32 changes: 32 additions & 0 deletions benchmarking/auto_metrics/IntegrationMetrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# --- New metric class using scib-metrics ------------------------------------
from scib_metrics.benchmark import Benchmarker
from typing import Dict
import anndata
import numpy as np

Comment on lines +4 to +6
Copy link

Copilot AI Jul 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Neither anndata nor numpy is used in this module; consider removing these imports to reduce unused dependencies.

Suggested change
import anndata
import numpy as np

Copilot uses AI. Check for mistakes.
# Keys looked up on the AnnData object that is being benchmarked.
EMBED = "X_scVI"  # adata.obsm key of the embedding to evaluate
BATCH_KEY = "batch"  # adata.obs column identifying each cell's batch
LABEL_KEY = "cell_type"  # adata.obs column holding the cell-type labels

class IntegrationMetric(AutoMetric):
    """
    Compute scib-metrics integration-quality scores for a single embedding.

    A ``Benchmarker`` is built for the embedding stored under ``EMBED`` in
    ``adata.obsm``, using ``BATCH_KEY`` and ``LABEL_KEY`` as the batch and
    label columns. No metric selection is passed, so the Benchmarker runs
    whichever bio-conservation and batch-correction metrics it enables by
    default rather than a fixed, hand-picked subset.
    """

    def metric(self, adata):
        """
        Benchmark ``adata`` and return the scores as a plain dictionary.

        Returns:
            The ``to_dict()`` form of the table returned by
            ``Benchmarker.get_results``.
        """
        bm = Benchmarker(
            adata,
            batch_key=BATCH_KEY,
            label_key=LABEL_KEY,
            embedding_obsm_keys=[EMBED],  # list of embeddings to evaluate
        )
        bm.prepare()  # computes neighbors
        bm.benchmark()  # runs selected metrics

        # Min-max scaling rescales every metric *across embeddings*. Only one
        # embedding is evaluated here, so scaling has nothing to compare
        # against and would not preserve the raw scores; ask for them unscaled.
        results = bm.get_results(min_max_scale=False)

        return results.to_dict()

# Module-level entry point. Neither `adata` nor `AutoMetric` is defined or
# imported in this file; presumably the benchmark runner supplies both before
# executing this module — TODO confirm against the runner.
IntegrationMetric().run(adata)
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"soma_joinid": 7,
"citation": "Publication: https://doi.org/10.1038/s41586-024-07944-6 Dataset Version: https://datasets.cellxgene.cziscience.com/463451bb-78a0-447f-9555-b05d11472d09.h5ad curated and distributed by CZ CELLxGENE Discover in Collection: https://cellxgene.cziscience.com/collections/fc19ae6c-d7c1-4dce-b703-62c5d52061b4",
"collection_id": "fc19ae6c-d7c1-4dce-b703-62c5d52061b4",
"collection_name": "A spatial human thymus cell atlas mapped to a continuous tissue axis",
"collection_doi": "10.1038/s41586-024-07944-6",
"collection_doi_label": "Yayon et al. (2024) Nature",
"dataset_id": "fbd69faa-b0c5-45ba-89c9-da938a7f5a14",
"dataset_version_id": "463451bb-78a0-447f-9555-b05d11472d09",
"dataset_title": "thymus scRNA-seq atlas - myeloid p2 subset",
"dataset_h5ad_path": "fbd69faa-b0c5-45ba-89c9-da938a7f5a14.h5ad",
"dataset_total_cell_count": 843
}
52 changes: 50 additions & 2 deletions benchmarking/prompt_testing/MultiAgentAutoTester.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,45 @@
SANDBOX_DATA_PATH = "/workspace/dataset.h5ad"
SANDBOX_RESOURCES_DIR = "/workspace/resources"

# ── Benchmark persistence --------------------------------------------------
from datetime import datetime
import pathlib, base64, json

# The UTC stamp is taken once, when this module is loaded, and becomes part of
# the ledger file name.
timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")  # e.g. '20250708-174115'

# Generated code snippets live next to the ledger, in their own sub-directory.
_SNIPPET_DIR = OUTPUTS_DIR / "snippets"
_LEDGER_PATH = OUTPUTS_DIR / f"benchmark_history_{timestamp}.jsonl"

# Create both locations up front so later writes never hit a missing directory.
_LEDGER_PATH.parent.mkdir(parents=True, exist_ok=True)
_SNIPPET_DIR.mkdir(parents=True, exist_ok=True)

def _dump_code_snippet(run_id: str, code: str) -> str:
    """
    Write ``code`` to a ``.py`` file under ``_SNIPPET_DIR``.

    ``run_id`` supplies the file stem. Run ids are assembled as
    ``<module>:<agent>:<epoch>`` and agent names are free text, so the id can
    contain characters (``:``, ``/``, ``\\`` …) that are invalid or unsafe in
    file names. Every run of characters outside ``[A-Za-z0-9._-]`` is therefore
    collapsed to a single ``_`` before the path is built.

    Args:
        run_id: Identifier of the benchmark run; used (sanitised) as file stem.
        code: Source text to persist.

    Returns:
        The snippet path relative to ``OUTPUTS_DIR``, as a string.
    """
    import re  # local import: the module's import block is not touched here

    # Strip leading/trailing dots so the stem can never be "." / ".." or a
    # hidden file; fall back to a fixed stem if nothing usable remains.
    safe_stem = re.sub(r"[^A-Za-z0-9._-]+", "_", run_id).strip("._") or "snippet"

    snippet_path = _SNIPPET_DIR / f"{safe_stem}.py"
    snippet_path.write_text(code, encoding="utf-8")
    return str(snippet_path.relative_to(OUTPUTS_DIR))

def _save_benchmark_record(*, run_id: str, results: dict, meta: dict, code: str | None):
    """
    Append one JSON line describing a benchmark run to the ledger file.

    The record holds a UTC timestamp, the run id, the dataset name, the metric
    results and, when ``code`` is given, the relative path of a snippet file
    containing that code.

    Args:
        run_id: Identifier of this benchmark run.
        results: Metric results; must be JSON-serialisable.
        meta: Dataset metadata. The dataset name is read from ``"name"`` and
            falls back to ``"dataset_title"``, the key used by CELLxGENE-style
            metadata files, so the field is not silently left null.
        code: Source of the code that produced the results, or ``None``/empty
            to record no snippet.
    """
    from datetime import timezone  # local import: only `datetime` is imported at module level

    meta = meta or {}

    record = {
        # Same "YYYY-MM-DDTHH:MM:SSZ" shape as before, but derived from an
        # aware UTC datetime instead of the deprecated naive utcnow().
        "ts": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "run": run_id,
        "dataset": meta.get("name") or meta.get("dataset_title"),
        "results": results,
    }
    if code:
        # Store a pointer to the snippet file rather than the code itself:
        # keeps each ledger line small and diff-friendly.
        record["code_path"] = _dump_code_snippet(run_id, code)

    # Explicit encoding so the ledger does not depend on the platform default.
    with _LEDGER_PATH.open("a", encoding="utf-8") as fh:
        fh.write(json.dumps(record) + "\n")

# ===========================================================================
# 1 · Backend selection
# ===========================================================================
Expand Down Expand Up @@ -153,6 +192,7 @@ def run(
tries: int = 0,
):
"""Main driver"""
last_code_snippet: str | None = None
mgr = _BackendManager()
console.print(f"Launching sandbox ({backend})…")

Expand Down Expand Up @@ -218,6 +258,7 @@ def build_system(a: Agent) -> str:
# ── Inline code execution -------------------------------------------
code = extract_python_code(msg)
if code:
last_code_snippet = code
console.print("[cyan]Executing code…[/cyan]")
try:
if is_exec_mode:
Expand All @@ -235,7 +276,7 @@ def build_system(a: Agent) -> str:

# ── Automatic benchmarking (v1.2 addition) --------------------------
if benchmark_module:
result_str = run_benchmark(mgr, benchmark_module)
result_str = run_benchmark(mgr, benchmark_module, metadata, current_agent.name, last_code_snippet)
if result_str:
history.append({"role": "user", "content": result_str})
display(console, "user", result_str)
Expand Down Expand Up @@ -285,7 +326,8 @@ def get_benchmark_module(console: Console, parent_dir: Path) -> Optional[Path]:
return None


def run_benchmark(mgr, benchmark_module: Path) -> str:
def run_benchmark(mgr, benchmark_module: Path, metadata: dict,
agent_name: str, code_snippet: str | None) -> str:
"""Execute benchmark module and *return* a compact JSON string."""
console.print(f"\n[bold cyan]Running benchmark module: {benchmark_module.name}[/bold cyan]")
autometric_base_path = benchmark_module.parent / "AutoMetric.py"
Expand Down Expand Up @@ -329,6 +371,12 @@ def run_benchmark(mgr, benchmark_module: Path) -> str:
if exec_result.get("status") == "ok" and isinstance(result_dict, dict):
for key, value in result_dict.items():
table.add_row(str(key), str(value))
_save_benchmark_record(
run_id=f"{benchmark_module.stem}:{agent_name}:{int(time.time())}",
results=result_dict,
meta=metadata,
code=code_snippet, # ← NEW
)
else:
table.add_row("Error", exec_result.get("stderr") or "An unknown error occurred.")

Expand Down
36 changes: 20 additions & 16 deletions benchmarking/prompt_testing/MultiAgentTester.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,23 +214,27 @@ def build_system(a: Agent) -> str:

history.append({"role": "user", "content": feedback})
display(console, "user", feedback)

if benchmark_module:
console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]")
else:
console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]")
try:
user_in = input().strip()
except (EOFError, KeyboardInterrupt):
user_in = "exit"
if user_in.lower() in {"exit", "quit"}:

def input_loop():
if benchmark_module:
console.print("\n[bold]Next message (blank = continue, 'benchmark' to run benchmarks, 'exit' to quit):[/bold]")
else:
console.print("\n[bold]Next message (blank = continue, 'exit' to quit):[/bold]")
try:
user_in = input().strip()
except (EOFError, KeyboardInterrupt):
user_in = "exit"
if user_in.lower() in {"exit", "quit"}:
return "break"
if user_in.lower() == "benchmark" and benchmark_module:
run_benchmark(mgr, benchmark_module)
input_loop() # Recurse to continue the loop after benchmarks
Copy link

Copilot AI Jul 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The recursive call to input_loop() is missing a return, so the result isn't propagated back and may lead to incorrect control flow. Change to return input_loop().

Suggested change
input_loop() # Recurse to continue the loop after benchmarks
return input_loop() # Recurse to continue the loop after benchmarks

Copilot uses AI. Check for mistakes.
if user_in:
history.append({"role": "user", "content": user_in})
display(console, "user", user_in)
input_val = input_loop()
if input_val == "break": # User chose to exit
break
if user_in.lower() == "benchmark" and benchmark_module:
run_benchmark(mgr, benchmark_module)
continue
if user_in:
history.append({"role": "user", "content": user_in})
display(console, "user", user_in)

console.print("Stopping sandbox…")
mgr.stop_container()
Expand Down
Binary file not shown.
2 changes: 1 addition & 1 deletion benchmarking/sandbox/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,4 @@ harmonypy

# Additional Tools
rapids-singlecell
scib
scib-metrics