forked from AntonOsika/gpt-engineer
Commit 37b91f1 (parent d2988c1): 19 changed files with 147 additions and 6 deletions.
@@ -0,0 +1,13 @@
import typer


def main(
    benchmarks: str,
    path_to_agent: str,
    task_name: str | None = None,
):
    # typer passes `benchmarks` as a single comma-separated string;
    # split it into individual benchmark names.
    benchmarks = benchmarks.split(",")


if __name__ == "__main__":
    typer.run(main)
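
A minimal sketch of exercising this entry point directly; the benchmark names and agent path below are placeholders, not values taken from this commit:

# Hypothetical invocation for illustration only.
main(
    benchmarks="default,gpte_eval",
    path_to_agent="projects/example_agent",
    task_name=None,
)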
10 files renamed without changes.
@@ -0,0 +1,13 @@
from gpt_engineer.benchmark.benchmarks.default.eval_tools import (
    load_evaluations_from_file,
)
from gpt_engineer.benchmark.types import Benchmark


def load_gpte_eval():
    # Load both evaluation suites into separate collections so neither
    # overwrites the other.
    new_code_evals = load_evaluations_from_file("new_code_eval.yaml")
    existing_code_evals = load_evaluations_from_file("existing_code_eval.yaml")

    return Benchmark(
        name="gpte_eval",
        tasks=[],  # TODO: convert the loaded evaluations into Task objects
    )
File renamed without changes.
@@ -0,0 +1,56 @@
# ExecTest is assumed to be defined or imported elsewhere in this file;
# its definition is not part of this excerpt.
tests: list[ExecTest] = [
    {
        "name": "hello",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "python hello.py",
        "prompt": "Change the code in hello.py to print 'Hello, human!'",
        "expect": {
            "correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
            "correct file": lambda ctx: ctx.files["hello.py"].strip()
            == "print('Hello, human!')",
        },
    },
    {
        "name": "hello-patch",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "python hello.py",
        "prompt": "Patch the code in hello.py to print 'Hello, human!'",
        "expect": {
            "correct output": lambda ctx: ctx.stdout == "Hello, human!\n",
            "correct file": lambda ctx: ctx.files["hello.py"].strip()
            == "print('Hello, human!')",
        },
    },
    {
        "name": "hello-ask",
        "files": {"hello.py": "print('Hello, world!')"},
        "run": "echo 'Erik' | python hello.py",
        # TODO: work around the "don't try to execute it" part by improving
        # gptme such that it just gives EOF to stdin in non-interactive mode
        "prompt": "modify hello.py to ask the user for their name and print 'Hello, <name>!'. don't try to execute it",
        "expect": {
            "correct output": lambda ctx: "Hello, Erik!" in ctx.stdout,
        },
    },
    {
        "name": "prime100",
        "files": {},
        "run": "python prime.py",
        "prompt": "write a script prime.py that computes and prints the 100th prime number",
        "expect": {
            "correct output": lambda ctx: "541" in ctx.stdout.split(),
        },
    },
    {
        "name": "init-git",
        "files": {},
        "run": "git status",
        "prompt": "initialize a git repository, write a main.py file, and commit it",
        "expect": {
            "clean exit": lambda ctx: ctx.exit_code == 0,
            "clean working tree": lambda ctx: "nothing to commit, working tree clean"
            in ctx.stdout,
            "main.py exists": lambda ctx: "main.py" in ctx.files,
            "we have a commit": lambda ctx: "No commits yet" not in ctx.stdout,
        },
    },
]
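
Each "expect" entry maps an assertion name to a predicate over an execution context. A minimal sketch of checking them, assuming a hypothetical Ctx type with stdout, exit_code, and files attributes (the real context type is not shown in this excerpt):

from dataclasses import dataclass, field


# Hypothetical stand-in for the real execution context, which this
# excerpt does not show.
@dataclass
class Ctx:
    stdout: str = ""
    exit_code: int = 0
    files: dict[str, str] = field(default_factory=dict)


def check(test: dict, ctx: Ctx) -> dict[str, bool]:
    # Evaluate every named predicate in the test's "expect" mapping.
    return {name: predicate(ctx) for name, predicate in test["expect"].items()}


# The "hello" test passes against a context holding the expected output and file.
ctx = Ctx(stdout="Hello, human!\n", files={"hello.py": "print('Hello, human!')"})
assert all(check(tests[0], ctx).values())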
@@ -0,0 +1,11 @@
from gpt_engineer.benchmark.types import Benchmark

# The loader's module path is not shown in this commit; this import path
# is an assumption.
from gpt_engineer.benchmark.benchmarks.default.load import load_gpte_eval

BENCHMARKS = {
    "default": load_gpte_eval,
}


def get_benchmark(name: str) -> Benchmark:
    if name not in BENCHMARKS:
        raise ValueError(f"Unknown benchmark {name}.")
    return BENCHMARKS[name]()
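
Callers then resolve benchmarks by name; a short usage sketch, assuming the loader's evaluation files are available:

# Resolve and build the registered benchmark.
benchmark = get_benchmark("default")

# Unknown names raise ValueError.
try:
    get_benchmark("nope")
except ValueError as exc:
    print(exc)  # Unknown benchmark nope.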
@@ -0,0 +1,23 @@
import time

from gpt_engineer.benchmark.types import Benchmark, TaskResult
from gpt_engineer.core.base_agent import Agent
from gpt_engineer.core.default.on_disk_execution_env import OnDiskExecutionEnv


def eval(benchmark: Benchmark, agent: Agent, task_name: str | None = None):
    env = OnDiskExecutionEnv()

    # Optionally restrict the run to a single named task.
    tasks = [
        task for task in benchmark.tasks if task_name is None or task.name == task_name
    ]

    task_results = []
    for task in tasks:
        start = time.time()
        code = agent.improve(task.initial_code, task.prompt, task.command)
        task_results.append(
            TaskResult(
                task_name=task.name,
                assertion_results={
                    assertion_name: assertion(code, env)
                    for assertion_name, assertion in (task.assertions or {}).items()
                },
                duration=time.time() - start,
            )
        )
    return task_results
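
A sketch of consuming the returned TaskResults, for instance to print a per-task summary; how the benchmark and agent are constructed is assumed here, not shown in this commit:

# Hypothetical driver: `benchmark` and `agent` come from elsewhere.
results = eval(benchmark, agent)
for result in results:
    passed = sum(result.assertion_results.values())
    total = len(result.assertion_results)
    print(f"{result.task_name}: {passed}/{total} passed in {result.duration:.1f}s")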
@@ -0,0 +1,31 @@
from dataclasses import dataclass
from typing import Callable

from gpt_engineer.core.base_execution_env import ExecutionEnv
from gpt_engineer.core.code import Files

# An assertion inspects the produced files and the execution environment
# and reports whether the check passed.
Assertion = Callable[[Files, ExecutionEnv], bool]


@dataclass
class Task:
    name: str
    initial_code: Files | None
    command: str | None
    prompt: str
    assertions: dict[str, Assertion] | None


@dataclass
class Benchmark:
    """A benchmark is a collection of tasks that evaluate a model's performance."""

    name: str
    tasks: list[Task]


@dataclass
class TaskResult:
    task_name: str
    assertion_results: dict[str, bool]
    duration: float
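
To make the types concrete, a minimal hand-built benchmark might look like the sketch below; the task content is illustrative only, and Files is assumed to behave like a dict of path to content:

# Illustrative only: one task with a single assertion on the produced files.
hello_task = Task(
    name="hello",
    initial_code=None,
    command="python hello.py",
    prompt="Change hello.py to print 'Hello, human!'",
    assertions={
        # Assumes Files is dict-like (path -> content).
        "correct file": lambda files, env: "Hello, human!" in files.get("hello.py", ""),
    },
)

smoke_benchmark = Benchmark(name="smoke", tasks=[hello_task])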
File renamed without changes.