Move evals
AntonOsika committed Nov 30, 2023
1 parent d2988c1 commit 37b91f1
Showing 19 changed files with 147 additions and 6 deletions.
6 changes: 0 additions & 6 deletions .gitignore
@@ -51,12 +51,6 @@ scratchpad
# Pyenv
.python-version

# Benchmark files
benchmark
!benchmark/*/prompt

.gpte_consent

.gpte_consent

# projects folder apart from default prompt
13 changes: 13 additions & 0 deletions gpt_engineer/benchmark/__main__.py
@@ -0,0 +1,13 @@
import typer


def main(
    benchmarks: str,
    path_to_agent: str,
    task_name: str | None = None,
):
    # The benchmarks argument is a comma-separated string of benchmark names,
    # e.g. "default".
    benchmark_names = benchmarks.split(",")


if __name__ == "__main__":
    typer.run(main)
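How this entry point gets wired to the rest of the new benchmark package is not part of this commit. A minimal sketch, assuming an Agent instance is constructed elsewhere (how path_to_agent becomes an Agent is left open here) and using get_benchmark and eval from the files added below:

# Hypothetical wiring (not part of this commit): run each named benchmark
# against an already-constructed Agent.
from gpt_engineer.benchmark.benchmarks.load import get_benchmark
from gpt_engineer.benchmark.run import eval
from gpt_engineer.core.base_agent import Agent


def run_all(benchmarks: str, agent: Agent, task_name: str | None = None):
    for name in benchmarks.split(","):
        benchmark = get_benchmark(name)
        results = eval(benchmark, agent, task_name)
        print(name, results)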
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
13 changes: 13 additions & 0 deletions gpt_engineer/benchmark/benchmarks/gpte_eval/load.py
@@ -0,0 +1,13 @@
from gpt_engineer.benchmark.benchmarks.default.eval_tools import (
load_evaluations_from_file,
)
from gpt_engineer.benchmark.types import Benchmark


def load_gpte_eval() -> Benchmark:
    # Combine both evaluation files (assuming load_evaluations_from_file returns a list).
    evals = load_evaluations_from_file("new_code_eval.yaml")
    evals += load_evaluations_from_file("existing_code_eval.yaml")

    # TODO: convert the loaded evaluations into Task objects.
    return Benchmark(
        name="gpte_eval",
        tasks=[],
    )
File renamed without changes.
56 changes: 56 additions & 0 deletions gpt_engineer/benchmark/benchmarks/gptme/load.py
@@ -0,0 +1,56 @@
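# Test cases adapted from gptme's eval suite. ExecTest is referenced but not
# defined or imported in this commit; it is presumably a TypedDict/dict with
# "name", "files", "run", "prompt", and "expect" keys, as used below.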
tests: list[ExecTest] = [
{
"name": "hello",
"files": {"hello.py": "print('Hello, world!')"},
"run": "python hello.py",
"prompt": "Change the code in hello.py to print 'Hello, human!'",
"expect": {
"correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
"correct file": lambda ctx: ctx.files["hello.py"].strip()
== "print('Hello, human!')",
},
},
{
"name": "hello-patch",
"files": {"hello.py": "print('Hello, world!')"},
"run": "python hello.py",
"prompt": "Patch the code in hello.py to print 'Hello, human!'",
"expect": {
"correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
"correct file": lambda ctx: ctx.files["hello.py"].strip()
== "print('Hello, human!')",
},
},
{
"name": "hello-ask",
"files": {"hello.py": "print('Hello, world!')"},
"run": "echo 'Erik' | python hello.py",
# TODO: work around the "don't try to execute it" part by improving gptme such that it just gives EOF to stdin in non-interactive mode
"prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
"expect": {
"correct output": lambda ctx: "Hello, Erik!" in ctx.stdout,
},
},
{
"name": "prime100",
"files": {},
"run": "python prime.py",
"prompt": "write a script prime.py that computes and prints the 100th prime number",
"expect": {
"correct output": lambda ctx: "541" in ctx.stdout.split(),
},
},
{
"name": "init-git",
"files": {},
"run": "git status",
"prompt": "initialize a git repository, write a main.py file, and commit it",
"expect": {
"clean exit": lambda ctx: ctx.exit_code == 0,
"clean working tree": lambda ctx: "nothing to commit, working tree clean"
in ctx.stdout,
"main.py exists": lambda ctx: "main.py" in ctx.files,
"we have a commit": lambda ctx: "No commits yet" not in ctx.stdout,
},
},
]
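As a rough illustration of how these declarative cases could be exercised, the following sketch evaluates the "expect" checks of the first test against a context object; the RunContext shape is assumed, inferred from the ctx.stdout / ctx.files / ctx.exit_code accesses above:

# Minimal sketch, not part of the commit: evaluate the "expect" checks for one test.
from dataclasses import dataclass, field


@dataclass
class RunContext:  # assumed shape, inferred from the lambdas above
    stdout: str = ""
    exit_code: int = 0
    files: dict[str, str] = field(default_factory=dict)


def check(test: dict, ctx: RunContext) -> dict[str, bool]:
    return {name: assertion(ctx) for name, assertion in test["expect"].items()}


ctx = RunContext(stdout="Hello, human!\n", files={"hello.py": "print('Hello, human!')"})
print(check(tests[0], ctx))  # {'correct output': True, 'correct file': True}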
11 changes: 11 additions & 0 deletions gpt_engineer/benchmark/benchmarks/load.py
@@ -0,0 +1,11 @@
from gpt_engineer.benchmark.benchmarks.gpte_eval.load import load_gpte_eval
from gpt_engineer.benchmark.types import Benchmark

BENCHMARKS = {
"default": load_gpte_eval,
}


def get_benchmark(name: str) -> Benchmark:
if name not in BENCHMARKS:
raise ValueError(f"Unknown benchmark {name}.")
return BENCHMARKS[name]()
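Usage is then a simple lookup by name (a sketch; with the registry above only "default" resolves):

benchmark = get_benchmark("default")  # returns the Benchmark built by load_gpte_eval
print(benchmark.name, len(benchmark.tasks))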
23 changes: 23 additions & 0 deletions gpt_engineer/benchmark/run.py
@@ -0,0 +1,23 @@
import time

from gpt_engineer.benchmark.types import Benchmark, TaskResult
from gpt_engineer.core.base_agent import Agent
from gpt_engineer.core.default.on_disk_execution_env import OnDiskExecutionEnv


def eval(benchmark: Benchmark, agent: Agent, task_name: str | None = None):
    env = OnDiskExecutionEnv()

    task_results = []
    for task in benchmark.tasks:
        start_time = time.time()
        code = agent.improve(task.initial_code, task.prompt, task.command)
        task_results.append(
            TaskResult(
                task_name=task.name,
                assertion_results={
                    assertion_name: assertion(code, env)
                    for assertion_name, assertion in (task.assertions or {}).items()
                },
                duration=time.time() - start_time,
            )
        )
    return task_results
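A small companion sketch (not part of the commit) showing how the returned results could be summarized:

def print_results(task_results: list[TaskResult]) -> None:
    for result in task_results:
        passed = sum(result.assertion_results.values())
        total = len(result.assertion_results)
        print(f"{result.task_name}: {passed}/{total} assertions passed in {result.duration:.1f}s")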
31 changes: 31 additions & 0 deletions gpt_engineer/benchmark/types.py
@@ -0,0 +1,31 @@
from dataclasses import dataclass
from typing import Callable

from gpt_engineer.core.base_execution_env import ExecutionEnv
from gpt_engineer.core.code import Files

Assertion = Callable[[Files, ExecutionEnv], bool]


@dataclass
class Task:
name: str
initial_code: Files | None
command: str | None
prompt: str
assertions: dict[str, Assertion] | None


@dataclass
class Benchmark:
"""A benchmark is a collection of tasks that evaluate a model's performance."""

name: str
tasks: list[Task]


@dataclass
class TaskResult:
task_name: str
assertion_results: dict[str, bool]
duration: float
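To make the data model concrete, here is a minimal hand-built example using these types; the task itself is invented for illustration, and the assertion assumes Files behaves like a dict of paths to contents:

hello_task = Task(
    name="hello",
    initial_code=None,
    command="python hello.py",
    prompt="Write hello.py so that it prints 'Hello, world!'",
    assertions={
        # assumes Files is dict-like, mapping file paths to contents
        "hello.py exists": lambda files, env: "hello.py" in files,
    },
)
smoke_benchmark = Benchmark(name="smoke", tasks=[hello_task])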
File renamed without changes.
