Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# syntax=docker/dockerfile:1
# Base image: Astral's uv image on Python 3.13 / Debian bookworm (per the tag),
# so the `uv` commands used below need no separate install step.
FROM ghcr.io/astral-sh/uv:python3.13-bookworm

# System packages: Node.js and npm (used to install mermaid-cli) plus the fonts
# and shared libraries Chromium depends on. The apt package index is deleted in
# the same layer so it does not end up in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        fonts-liberation \
        fonts-noto-color-emoji \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libatspi2.0-0 \
        libcairo2 \
        libcups2 \
        libdrm2 \
        libgbm1 \
        libgtk-3-0 \
        libnss3 \
        libpango-1.0-0 \
        libx11-xcb1 \
        libxcomposite1 \
        libxdamage1 \
        libxfixes3 \
        libxi6 \
        libxrandr2 \
        libxrender1 \
        libxshmfence1 \
        libxss1 \
        libxtst6 \
        nodejs \
        npm && \
    rm -rf /var/lib/apt/lists/*

# Install the Mermaid CLI globally, then drop npm's download cache in the same
# layer so the cached tarballs do not add to the image size.
# NOTE(review): the package is unpinned, so rebuilds may pick up a newer
# release; consider pinning (@mermaid-js/mermaid-cli@<version>) for
# reproducible images.
RUN npm install -g @mermaid-js/mermaid-cli \
    && npm cache clean --force

# Set work directory (created automatically if missing)
WORKDIR /app

# Copy only the dependency manifests first so the dependency layer below stays
# cached until pyproject.toml or uv.lock changes
COPY pyproject.toml uv.lock ./

# Install the locked third-party dependencies without the project itself.
# Running this BEFORE the source is copied is what makes the caching work:
# a source-only change no longer forces every dependency to be reinstalled.
RUN uv sync --frozen --no-install-project

# Copy source
# NOTE(review): this copies the whole build context; confirm a .dockerignore
# excludes .venv, .git and local result directories.
COPY . .

# Sync again now that the source is present so the project itself is installed;
# the dependencies are already in place from the layer above
RUN uv sync --frozen

# Default entrypoint: the wrapper script copied in from docker/ in the build
# context. Exec (JSON-array) form is used, so arguments given after the image
# name on `docker run` are appended to this command.
ENTRYPOINT ["/app/docker/merbench-entrypoint.sh"]
12 changes: 11 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,14 @@ adk_basic_ui:
uv run adk web agents_mcp_usage/basic_mcp

# Run `adk web` (through uv) against the agents_mcp_usage/multi_mcp directory.
adk_multi_ui:
uv run adk web agents_mcp_usage/multi_mcp
uv run adk web agents_mcp_usage/multi_mcp

# Build the benchmark image from the Dockerfile in the current directory and
# tag it `merbench`. Declared phony because the target produces no file, so a
# stray file with the same name can never make it look up to date.
.PHONY: merbench-docker-build
merbench-docker-build:
	docker build -t merbench .

# Run the `merbench` image in a throwaway container (--rm).
# - API keys are forwarded from the caller's environment by name (`-e VAR`
#   with no value), which keeps them off the docker command line and avoids
#   unquoted shell expansion of the values.
# - ./mermaid_eval_results on the host is bind-mounted over
#   /app/mermaid_eval_results so results persist after the container exits.
# - Extra arguments for the entrypoint go in ARGS, e.g.
#     make merbench-docker-run ARGS="--models <comma-separated-models>"
#   ARGS is deliberately left unquoted so it splits into separate arguments.
.PHONY: merbench-docker-run
merbench-docker-run:
	docker run --rm \
		-e GEMINI_API_KEY \
		-e OPENAI_API_KEY \
		-v "$$(pwd)/mermaid_eval_results:/app/mermaid_eval_results" \
		merbench $${ARGS}
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,34 @@ Run an Agent framework script e.g.:

Check console, Logfire, or the ADK web UI for output

## Docker quickstart

Build the pre-baked evaluation image (includes Python dependencies and the Mermaid CLI):

```bash
docker build -t merbench .
```

Run the multi-model Mermaid benchmark with your API keys and a bind mount so results persist on the host:

```bash
docker run --rm \
-e GEMINI_API_KEY="your-gemini-key" \
-e OPENAI_API_KEY="your-openai-key" \
-v "$(pwd)/mermaid_eval_results:/app/mermaid_eval_results" \
merbench --models gemini-1.5-pro,openai-gpt-4.1-mini
```

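The container entrypoint is `docker/merbench-entrypoint.sh`, which runs `run_multi_evals.py` and forwards any arguments given after the image name to it. Override the entrypoint with `--entrypoint` to launch other tooling, such as the evaluation UI: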

```bash
docker run --rm \
-e GEMINI_API_KEY="your-gemini-key" \
-v "$(pwd)/mermaid_eval_results:/app/mermaid_eval_results" \
--entrypoint uv \
merbench run agents_mcp_usage/evaluations/mermaid_evals/merbench_ui.py
```

## Project Overview

This project aims to teach:
Expand Down
7 changes: 7 additions & 0 deletions docker/merbench-entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/sh
# Container entrypoint: runs the multi-model Mermaid evaluation script and
# forwards every argument given to the container on to it.

# Abort immediately if any command below fails
set -e

# Create the results directory when nothing is mounted there; if the host
# bind-mounted it, the directory already exists and this is a no-op
mkdir -p /app/mermaid_eval_results

# The script path is relative to the working directory (/app in the image).
# --frozen uses the uv.lock baked into the image as-is, matching the build's
# `uv sync --frozen`, so the lockfile is not re-resolved at container start.
# exec hands the shell's process over to `uv run` instead of leaving the
# shell as its parent.
exec uv run --frozen agents_mcp_usage/evaluations/mermaid_evals/run_multi_evals.py "$@"