Evaluate #21

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 3 commits, Sep 12, 2024
2 changes: 2 additions & 0 deletions .github/workflows/system.yml
@@ -26,3 +26,5 @@ jobs:
         run: uv run commit0 get-tests simpy
       - name: Test
         run: uv run commit0 test-reference simpy tests/test_event.py::test_succeed
+      - name: Evaluate
+        run: uv run commit0 evaluate-reference simpy
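For context, the new workflow step runs the full evaluation for the simpy split. A minimal sketch (not part of this PR) of reproducing that step locally, assuming uv and the commit0 CLI are available as in CI:

# Hedged sketch: shell out to the same CLI invocation the workflow step uses.
import subprocess

subprocess.run(
    ["uv", "run", "commit0", "evaluate-reference", "simpy"],
    check=True,  # fail loudly, like a CI step would
)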
19 changes: 17 additions & 2 deletions commit0/__main__.py
@@ -2,6 +2,7 @@
 import commit0.harness.get_pytest_ids
 import commit0.harness.build
 import commit0.harness.setup
+import commit0.harness.evaluate
 import copy
 import sys
 import os
@@ -28,7 +29,7 @@ def main() -> None:
     # after hydra gets all configs, put command-line arguments back
     sys.argv = sys_argv
     # repo_split: split from command line has a higher priority than split in hydra
-    if command in ["clone", "build"]:
+    if command in ["clone", "build", "evaluate", "evaluate-reference"]:
         if len(sys.argv) == 3:
             if sys.argv[2] not in SPLIT:
                 raise ValueError(
@@ -53,7 +54,7 @@
         )
     elif command == "get-tests":
         repo = sys.argv[2]
-        commit0.harness.get_pytest_ids.main(repo)
+        commit0.harness.get_pytest_ids.main(repo, stdout=True)
     elif command == "test" or command == "test-reference":
         repo = sys.argv[2]
         test_ids = sys.argv[3]
@@ -68,6 +69,20 @@
             test_ids,
             config.backend,
             config.timeout,
+            stdout=True,
         )
+    elif command == "evaluate" or command == "evaluate-reference":
+        if command == "evaluate-reference":
+            config.branch = "reference"
+        commit0.harness.evaluate.main(
+            config.dataset_name,
+            config.dataset_split,
+            config.repo_split,
+            config.base_dir,
+            config.branch,
+            config.backend,
+            config.timeout,
+            config.num_workers,
+        )


13 changes: 11 additions & 2 deletions commit0/harness/constants.py
@@ -26,7 +26,15 @@ class RepoInstance(TypedDict):
 EVAL_BACKENDS = ["local", "modal"]
 
 # available commands
-COMMANDS = ["clone", "build", "test", "test-reference", "get-tests"]
+COMMANDS = [
+    "clone",
+    "build",
+    "test",
+    "test-reference",
+    "get-tests",
+    "evaluate",
+    "evaluate-reference",
+]
 # repo splits
 SPLIT_MINITORCH = ["minitorch"]
 SPLIT_SIMPY = ["simpy"]
@@ -80,7 +88,8 @@ class RepoInstance(TypedDict):
     "mimesis",
     "babel",
     "dnspython",
-    "portalocker," "cookiecutter",
+    "portalocker",
+    "cookiecutter",
     "pyjwt",
     "python-rsa",
     "more-itertools",
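Note that the constants fix above is not cosmetic: adjacent string literals are implicitly concatenated in Python, so the old entry silently merged two repo names into one bogus string. A standalone illustration:

# Adjacent string literals concatenate, so the old list had one wrong entry.
before = ["portalocker," "cookiecutter"]  # == ["portalocker,cookiecutter"]
after = ["portalocker", "cookiecutter"]   # two separate repo names

assert before == ["portalocker,cookiecutter"]
assert len(after) == 2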
126 changes: 126 additions & 0 deletions commit0/harness/evaluate.py
@@ -0,0 +1,126 @@
import logging
import os
import traceback
from collections import Counter

from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import load_dataset
from tqdm import tqdm
from typing import Iterator

from commit0.harness.run_pytest_ids import main as run_tests
from commit0.harness.get_pytest_ids import main as get_tests
from commit0.harness.constants import RepoInstance, SPLIT

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def main(
    dataset_name: str,
    dataset_split: str,
    repo_split: str,
    base_dir: str,
    branch: str,
    backend: str,
    timeout: int,
    num_workers: int,
) -> None:
    dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
    repos = SPLIT[repo_split]
    pairs = []
    for example in dataset:
        repo_name = example["repo"].split("/")[-1]
        if repo_split != "all" and repo_name not in SPLIT[repo_split]:
            continue
        pairs.append((repo_name, example["test"]["test_dir"]))

    log_dirs = []
    with tqdm(total=len(repos), smoothing=0, desc="Evaluating repos") as pbar:
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            # Create a future for running each instance
            futures = {
                executor.submit(
                    run_tests,
                    dataset_name,
                    dataset_split,
                    base_dir,
                    repo,
                    branch,
                    test_dir,
                    backend,
                    timeout,
                    stdout=False,
                ): None
                for repo, test_dir in pairs
            }
            # Wait for each future to complete
            for future in as_completed(futures):
                pbar.update(1)
                try:
                    # Update progress bar, check if instance ran successfully
                    result = future.result()
                    log_dirs.append(result)
                except Exception:
                    traceback.print_exc()
                    continue

    # get numbers
    out = []
    for name in tqdm(log_dirs):
        report_file = os.path.join(name, "report.json")
        name = name.split("/")[2]
        if not os.path.exists(report_file):
            out.append(
                {
                    "name": name,
                    "sum": 0,
                    "passed": 0,
                    "num_passed": 0,
                }
            )
            continue
        report = load_dataset("json", data_files=report_file, split="train")  # type: ignore
        test_ids = get_tests(name, stdout=False)
        tests = {x["nodeid"]: x["call"] for x in report["tests"][0]}  # type: ignore
        status = []
        runtimes = []
        no_runs = 0
        for test_id in test_ids:
            if test_id in tests and tests[test_id] is not None:
                status.append(tests[test_id]["outcome"])
                runtimes.append(tests[test_id]["duration"])
                no_runs += 1
            else:
                status.append("failed")
                runtimes.append(0)
        status = Counter(status)
        if no_runs == 0:
            total = 0
        else:
            total = sum(runtimes)
        if "xfail" not in status:
            status["xfail"] = 0
        passed = (status["passed"] + status["xfail"]) / sum(status.values())
        out.append(
            {
                "name": name,
                "sum": total,
                "passed": passed,
                "num_passed": status["passed"] + status["xfail"],
                "num_tests": sum(status.values()),
            }
        )
    print("repo,runtime,num_passed/num_tests")
    out = sorted(out, key=lambda x: x["sum"], reverse=True)
    for x in out:
        print(f"{x['name']},{x['sum']},{x['num_passed']}/{x['num_tests']}")
    total_runtime = sum([x["sum"] for x in out])
    averaged_passed = sum([x["passed"] for x in out]) / len(out)
    print(f"total runtime: {total_runtime}")
    print(f"average pass rate: {averaged_passed}")


__all__ = []
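To make the reporting logic in evaluate.py concrete, here is a worked example (hypothetical numbers) of the pass-rate formula: outcomes counted as passing are "passed" plus expected failures ("xfail"), divided by all collected test ids, with tests that never ran treated as failed.

# Worked example of the pass-rate computation used above; counts are made up.
from collections import Counter

status = Counter({"passed": 44, "failed": 5, "xfail": 1})  # hypothetical outcomes
pass_rate = (status["passed"] + status["xfail"]) / sum(status.values())
assert abs(pass_rate - 0.9) < 1e-9  # (44 + 1) / 50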
12 changes: 9 additions & 3 deletions commit0/harness/get_pytest_ids.py
@@ -1,16 +1,22 @@
 import tarfile
+from typing import List
 
 
-def main(repo: str) -> None:
+def main(repo: str, stdout: bool) -> List[str]:
     repo = repo.lower()
     repo = repo.replace(".", "-")
+    out = ""
     with tarfile.open(f"commit0/data/test_ids/{repo}.tar.bz2", "r:bz2") as tar:
         for member in tar.getmembers():
             if member.isfile():
                 file = tar.extractfile(member)
                 if file:
-                    content = file.read()
-                    print(content.decode("utf-8"))
+                    content = file.read().decode("utf-8")
+                    out += content
+                    if stdout:
+                        print(content)
+    out = out.split("\n")
+    return out
 
 
 __all__ = []
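A short usage sketch of the updated helper: with stdout=False it returns the test ids as a list instead of only printing them, which is what evaluate.py relies on. The repo name here is only illustrative.

# Hedged sketch: collect test ids for a repo without printing them.
from commit0.harness.get_pytest_ids import main as get_tests

test_ids = get_tests("simpy", stdout=False)
print(len(test_ids), "test ids, e.g.", test_ids[0] if test_ids else None)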
30 changes: 22 additions & 8 deletions commit0/harness/run_pytest_ids.py
@@ -38,7 +38,12 @@ class ExecutionBackend(StrEnum):
 
 
 def run_docker(
-    spec: Spec, logger: logging.Logger, eval_file: Path, timeout: int, log_dir: Path
+    spec: Spec,
+    logger: logging.Logger,
+    eval_file: Path,
+    timeout: int,
+    log_dir: Path,
+    stdout: bool,
 ) -> None:
     client = docker.from_env()
     container = None
@@ -65,7 +70,8 @@ def run_docker(
             output, "--json-report --json-report-file=report.json"
         )
         # stdout might be more straightforward
-        print(test_output)
+        if stdout:
+            print(test_output)
         test_output_path = log_dir / "test_output.txt"
         with open(test_output_path, "w") as f:
             f.write(test_output)
@@ -105,7 +111,12 @@ def run_docker(
 
 
 def run_modal(
-    spec: Spec, logger: logging.Logger, eval_file: Path, timeout: int, log_dir: Path
+    spec: Spec,
+    logger: logging.Logger,
+    eval_file: Path,
+    timeout: int,
+    log_dir: Path,
+    stdout: bool,
 ) -> None:
     # get image name to pull from dockerhub
     # spec.repo_image_key
@@ -182,7 +193,8 @@ def run_modal(
         )
 
         # stdout might be more straightforward
-        print(test_output)
+        if stdout:
+            print(test_output)
         test_output_path = log_dir / "test_output.txt"
         with open(test_output_path, "w") as f:
             f.write(test_output)
@@ -204,7 +216,8 @@ def main(
     test_ids: str,
     backend: str,
     timeout: int,
-) -> None:
+    stdout: bool,
+) -> str:
     dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
     spec = None
     example = None
@@ -217,7 +230,7 @@
 
     hashed_test_ids = get_hash_string(test_ids)
     # set up logging
-    log_dir = RUN_PYTEST_LOG_DIR / repo / hashed_test_ids
+    log_dir = RUN_PYTEST_LOG_DIR / repo / branch / hashed_test_ids
     log_dir.mkdir(parents=True, exist_ok=True)
     log_file = log_dir / "run_pytest.log"
     logger = setup_logger(repo, log_file)
@@ -241,9 +254,10 @@
     eval_file.write_text(eval_script)
 
     if ExecutionBackend(backend) == ExecutionBackend.LOCAL:
-        run_docker(spec, logger, eval_file, timeout, log_dir)
+        run_docker(spec, logger, eval_file, timeout, log_dir, stdout)
     elif ExecutionBackend(backend) == ExecutionBackend.MODAL:
-        run_modal(spec, logger, eval_file, timeout, log_dir)
+        run_modal(spec, logger, eval_file, timeout, log_dir, stdout)
+    return str(log_dir)
 
 
 __all__ = []
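Finally, a hedged sketch of calling the updated runner directly: stdout=False suppresses the raw pytest output, and the return value is the per-run log directory, now namespaced by branch. The positional order follows the call in evaluate.py; all values below are placeholders rather than the project's real config.

# Hedged sketch; every value here is a placeholder for illustration only.
from commit0.harness.run_pytest_ids import main as run_tests

log_dir = run_tests(
    "some-org/some-dataset",              # dataset_name (placeholder)
    "test",                               # dataset_split (placeholder)
    "repos/",                             # base_dir (placeholder)
    "simpy",                              # repo
    "reference",                          # branch
    "tests/test_event.py::test_succeed",  # test_ids
    "local",                              # backend
    1800,                                 # timeout in seconds (placeholder)
    stdout=False,
)
print(log_dir)  # e.g. <RUN_PYTEST_LOG_DIR>/simpy/reference/<hash of test_ids>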