Move evals
AntonOsika committed Nov 30, 2023
1 parent d2988c1 commit 37b91f1
Showing 19 changed files with 147 additions and 6 deletions.
6 changes: 0 additions & 6 deletions .gitignore
@@ -51,12 +51,6 @@ scratchpad
# Pyenv
.python-version

# Benchmark files
benchmark
!benchmark/*/prompt

.gpte_consent

.gpte_consent

# projects folder apart from default prompt
13 changes: 13 additions & 0 deletions gpt_engineer/benchmark/__main__.py
@@ -0,0 +1,13 @@
import typer


def main(
    benchmarks: str,
    path_to_agent: str,
    task_name: str | None = None,
):
    # The benchmarks argument is a comma-separated string of benchmark names,
    # e.g. "default".
    benchmark_names = benchmarks.split(",")


if __name__ == "__main__":
    typer.run(main)
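How this entry point gets wired to the rest of the new benchmark package is not part of this commit. A minimal sketch, assuming an Agent instance is constructed elsewhere (how path_to_agent becomes an Agent is left open here) and using get_benchmark and eval from the files added below:

# Hypothetical wiring (not part of this commit): run each named benchmark
# against an already-constructed Agent.
from gpt_engineer.benchmark.benchmarks.load import get_benchmark
from gpt_engineer.benchmark.run import eval
from gpt_engineer.core.base_agent import Agent


def run_all(benchmarks: str, agent: Agent, task_name: str | None = None):
    for name in benchmarks.split(","):
        benchmark = get_benchmark(name)
        results = eval(benchmark, agent, task_name)
        print(name, results)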
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
13 changes: 13 additions & 0 deletions gpt_engineer/benchmark/benchmarks/gpte_eval/load.py
@@ -0,0 +1,13 @@
from gpt_engineer.benchmark.benchmarks.default.eval_tools import (
load_evaluations_from_file,
)
from gpt_engineer.benchmark.types import Benchmark


def load_gpte_eval() -> Benchmark:
    # Combine both evaluation files (assuming load_evaluations_from_file returns a list).
    evals = load_evaluations_from_file("new_code_eval.yaml")
    evals += load_evaluations_from_file("existing_code_eval.yaml")

    # TODO: convert the loaded evaluations into Task objects.
    return Benchmark(
        name="gpte_eval",
        tasks=[],
    )
File renamed without changes.
56 changes: 56 additions & 0 deletions gpt_engineer/benchmark/benchmarks/gptme/load.py
@@ -0,0 +1,56 @@
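# Test cases adapted from gptme's eval suite. ExecTest is referenced but not
# defined or imported in this commit; it is presumably a TypedDict/dict with
# "name", "files", "run", "prompt", and "expect" keys, as used below.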
tests: list[ExecTest] = [
{
"name": "hello",
"files": {"hello.py": "print('Hello, world!')"},
"run": "python hello.py",
"prompt": "Change the code in hello.py to print 'Hello, human!'",
"expect": {
"correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
"correct file": lambda ctx: ctx.files["hello.py"].strip()
== "print('Hello, human!')",
},
},
{
"name": "hello-patch",
"files": {"hello.py": "print('Hello, world!')"},
"run": "python hello.py",
"prompt": "Patch the code in hello.py to print 'Hello, human!'",
"expect": {
"correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
"correct file": lambda ctx: ctx.files["hello.py"].strip()
== "print('Hello, human!')",
},
},
{
"name": "hello-ask",
"files": {"hello.py": "print('Hello, world!')"},
"run": "echo 'Erik' | python hello.py",
# TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
"prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
"expect": {
"correct output": lambda ctx: "Hello, Erik!" in ctx.stdout,
},
},
{
"name": "prime100",
"files": {},
"run": "python prime.py",
"prompt": "write a script prime.py that computes and prints the 100th prime number",
"expect": {
"correct output": lambda ctx: "541" in ctx.stdout.split(),
},
},
{
"name": "init-git",
"files": {},
"run": "git status",
"prompt": "initialize a git repository, write a main.py file, and commit it",
"expect": {
"clean exit": lambda ctx: ctx.exit_code == 0,
"clean working tree": lambda ctx: "nothing to commit, working tree clean"
in ctx.stdout,
"main.py exists": lambda ctx: "main.py" in ctx.files,
"we have a commit": lambda ctx: "No commits yet" not in ctx.stdout,
},
},
]
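As a rough illustration of how these declarative cases could be exercised, the following sketch evaluates the "expect" checks of the first test against a context object; the RunContext shape is assumed, inferred from the ctx.stdout / ctx.files / ctx.exit_code accesses above:

# Minimal sketch, not part of the commit: evaluate the "expect" checks for one test.
from dataclasses import dataclass, field


@dataclass
class RunContext:  # assumed shape, inferred from the lambdas above
    stdout: str = ""
    exit_code: int = 0
    files: dict[str, str] = field(default_factory=dict)


def check(test: dict, ctx: RunContext) -> dict[str, bool]:
    return {name: assertion(ctx) for name, assertion in test["expect"].items()}


ctx = RunContext(stdout="Hello, human!\n", files={"hello.py": "print('Hello, human!')"})
print(check(tests[0], ctx))  # {'correct output': True, 'correct file': True}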
11 changes: 11 additions & 0 deletions gpt_engineer/benchmark/benchmarks/load.py
@@ -0,0 +1,11 @@
from gpt_engineer.benchmark.benchmarks.gpte_eval.load import load_gpte_eval
from gpt_engineer.benchmark.types import Benchmark

BENCHMARKS = {
"default": load_gpte_eval,
}


def get_benchmark(name: str) -> Benchmark:
if name not in BENCHMARKS:
raise ValueError(f"Unknown benchmark {name}.")
return BENCHMARKS[name]()
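Usage is then a simple lookup by name (a sketch; with the registry above only "default" resolves):

benchmark = get_benchmark("default")  # returns the Benchmark built by load_gpte_eval
print(benchmark.name, len(benchmark.tasks))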
23 changes: 23 additions & 0 deletions gpt_engineer/benchmark/run.py
@@ -0,0 +1,23 @@
import time

from gpt_engineer.benchmark.types import Benchmark, TaskResult
from gpt_engineer.core.base_agent import Agent
from gpt_engineer.core.default.on_disk_execution_env import OnDiskExecutionEnv


def eval(benchmark: Benchmark, agent: Agent, task_name: str | None = None):
    env = OnDiskExecutionEnv()

    task_results = []
    for task in benchmark.tasks:
        start_time = time.time()
        code = agent.improve(task.initial_code, task.prompt, task.command)
        task_results.append(
            TaskResult(
                task_name=task.name,
                assertion_results={
                    assertion_name: assertion(code, env)
                    for assertion_name, assertion in (task.assertions or {}).items()
                },
                duration=time.time() - start_time,
            )
        )
    return task_results
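A small companion sketch (not part of the commit) showing how the returned results could be summarized:

def print_results(task_results: list[TaskResult]) -> None:
    for result in task_results:
        passed = sum(result.assertion_results.values())
        total = len(result.assertion_results)
        print(f"{result.task_name}: {passed}/{total} assertions passed in {result.duration:.1f}s")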
31 changes: 31 additions & 0 deletions gpt_engineer/benchmark/types.py
@@ -0,0 +1,31 @@
from dataclasses import dataclass
from typing import Callable

from gpt_engineer.core.base_execution_env import ExecutionEnv
from gpt_engineer.core.code import Files

Assertion = Callable[[Files, ExecutionEnv], bool]


@dataclass
class Task:
name: str
initial_code: Files | None
command: str | None
prompt: str
assertions: dict[str, Assertion] | None


@dataclass
class Benchmark:
"""A benchmark is a collection of tasks that evaluate a model's performance."""

name: str
tasks: list[Task]


@dataclass
class TaskResult:
task_name: str
assertion_results: dict[str, bool]
duration: float
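To make the data model concrete, here is a minimal hand-built example using these types; the task itself is invented for illustration, and the assertion assumes Files behaves like a dict of paths to contents:

hello_task = Task(
    name="hello",
    initial_code=None,
    command="python hello.py",
    prompt="Write hello.py so that it prints 'Hello, world!'",
    assertions={
        # assumes Files is dict-like, mapping file paths to contents
        "hello.py exists": lambda files, env: "hello.py" in files,
    },
)
smoke_benchmark = Benchmark(name="smoke", tasks=[hello_task])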
File renamed without changes.
