Evaluate #21

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 3 commits, Sep 12, 2024
2 changes: 2 additions & 0 deletions .github/workflows/system.yml
@@ -26,3 +26,5 @@ jobs:
         run: uv run commit0 get-tests simpy
       - name: Test
         run: uv run commit0 test-reference simpy tests/test_event.py::test_succeed
+      - name: Evaluate
+        run: uv run commit0 evaluate-reference simpy
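For context, the new workflow step runs the full evaluation for the simpy split. A minimal sketch (not part of this PR) of reproducing that step locally, assuming uv and the commit0 CLI are available as in CI:

# Hedged sketch: shell out to the same CLI invocation the workflow step uses.
import subprocess

subprocess.run(
    ["uv", "run", "commit0", "evaluate-reference", "simpy"],
    check=True,  # fail loudly, like a CI step would
)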
19 changes: 17 additions & 2 deletions commit0/__main__.py
@@ -2,6 +2,7 @@
 import commit0.harness.get_pytest_ids
 import commit0.harness.build
 import commit0.harness.setup
+import commit0.harness.evaluate
 import copy
 import sys
 import os
@@ -28,7 +29,7 @@ def main() -> None:
     # after hydra gets all configs, put command-line arguments back
     sys.argv = sys_argv
     # repo_split: split from command line has a higher priority than split in hydra
-    if command in ["clone", "build"]:
+    if command in ["clone", "build", "evaluate", "evaluate-reference"]:
         if len(sys.argv) == 3:
             if sys.argv[2] not in SPLIT:
                 raise ValueError(
@@ -53,7 +54,7 @@
         )
     elif command == "get-tests":
         repo = sys.argv[2]
-        commit0.harness.get_pytest_ids.main(repo)
+        commit0.harness.get_pytest_ids.main(repo, stdout=True)
     elif command == "test" or command == "test-reference":
         repo = sys.argv[2]
         test_ids = sys.argv[3]
@@ -68,6 +69,20 @@
             test_ids,
             config.backend,
             config.timeout,
+            stdout=True,
         )
+    elif command == "evaluate" or command == "evaluate-reference":
+        if command == "evaluate-reference":
+            config.branch = "reference"
+        commit0.harness.evaluate.main(
+            config.dataset_name,
+            config.dataset_split,
+            config.repo_split,
+            config.base_dir,
+            config.branch,
+            config.backend,
+            config.timeout,
+            config.num_workers,
+        )


13 changes: 11 additions & 2 deletions commit0/harness/constants.py
@@ -26,7 +26,15 @@ class RepoInstance(TypedDict):
 EVAL_BACKENDS = ["local", "modal"]
 
 # available commands
-COMMANDS = ["clone", "build", "test", "test-reference", "get-tests"]
+COMMANDS = [
+    "clone",
+    "build",
+    "test",
+    "test-reference",
+    "get-tests",
+    "evaluate",
+    "evaluate-reference",
+]
 # repo splits
 SPLIT_MINITORCH = ["minitorch"]
 SPLIT_SIMPY = ["simpy"]
@@ -80,7 +88,8 @@ class RepoInstance(TypedDict):
     "mimesis",
     "babel",
     "dnspython",
-    "portalocker," "cookiecutter",
+    "portalocker",
+    "cookiecutter",
     "pyjwt",
     "python-rsa",
     "more-itertools",
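Note that the constants fix above is not cosmetic: adjacent string literals are implicitly concatenated in Python, so the old entry silently merged two repo names into one bogus string. A standalone illustration:

# Adjacent string literals concatenate, so the old list had one wrong entry.
before = ["portalocker," "cookiecutter"]  # == ["portalocker,cookiecutter"]
after = ["portalocker", "cookiecutter"]   # two separate repo names

assert before == ["portalocker,cookiecutter"]
assert len(after) == 2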
126 changes: 126 additions & 0 deletions commit0/harness/evaluate.py
@@ -0,0 +1,126 @@
import logging
import os
import traceback
from collections import Counter

from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import load_dataset
from tqdm import tqdm
from typing import Iterator

from commit0.harness.run_pytest_ids import main as run_tests
from commit0.harness.get_pytest_ids import main as get_tests
from commit0.harness.constants import RepoInstance, SPLIT

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def main(
    dataset_name: str,
    dataset_split: str,
    repo_split: str,
    base_dir: str,
    branch: str,
    backend: str,
    timeout: int,
    num_workers: int,
) -> None:
    dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
    repos = SPLIT[repo_split]
    pairs = []
    for example in dataset:
        repo_name = example["repo"].split("/")[-1]
        if repo_split != "all" and repo_name not in SPLIT[repo_split]:
            continue
        pairs.append((repo_name, example["test"]["test_dir"]))

    log_dirs = []
    with tqdm(total=len(repos), smoothing=0, desc="Evaluating repos") as pbar:
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            # Create a future for running each instance
            futures = {
                executor.submit(
                    run_tests,
                    dataset_name,
                    dataset_split,
                    base_dir,
                    repo,
                    branch,
                    test_dir,
                    backend,
                    timeout,
                    stdout=False,
                ): None
                for repo, test_dir in pairs
            }
            # Wait for each future to complete
            for future in as_completed(futures):
                pbar.update(1)
                try:
                    # Update progress bar, check if instance ran successfully
                    result = future.result()
                    log_dirs.append(result)
                except Exception:
                    traceback.print_exc()
                    continue

    # get numbers
    out = []
    for name in tqdm(log_dirs):
        report_file = os.path.join(name, "report.json")
        name = name.split("/")[2]
        if not os.path.exists(report_file):
            out.append(
                {
                    "name": name,
                    "sum": 0,
                    "passed": 0,
                    "num_passed": 0,
                }
            )
            continue
        report = load_dataset("json", data_files=report_file, split="train")  # type: ignore
        test_ids = get_tests(name, stdout=False)
        tests = {x["nodeid"]: x["call"] for x in report["tests"][0]}  # type: ignore
        status = []
        runtimes = []
        no_runs = 0
        for test_id in test_ids:
            if test_id in tests and tests[test_id] is not None:
                status.append(tests[test_id]["outcome"])
                runtimes.append(tests[test_id]["duration"])
                no_runs += 1
            else:
                status.append("failed")
                runtimes.append(0)
        status = Counter(status)
        if no_runs == 0:
            total = 0
        else:
            total = sum(runtimes)
        if "xfail" not in status:
            status["xfail"] = 0
        passed = (status["passed"] + status["xfail"]) / sum(status.values())
        out.append(
            {
                "name": name,
                "sum": total,
                "passed": passed,
                "num_passed": status["passed"] + status["xfail"],
                "num_tests": sum(status.values()),
            }
        )
    print("repo,runtime,num_passed/num_tests")
    out = sorted(out, key=lambda x: x["sum"], reverse=True)
    for x in out:
        print(f"{x['name']},{x['sum']},{x['num_passed']}/{x['num_tests']}")
    total_runtime = sum([x["sum"] for x in out])
    averaged_passed = sum([x["passed"] for x in out]) / len(out)
    print(f"total runtime: {total_runtime}")
    print(f"average pass rate: {averaged_passed}")


__all__ = []
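To make the reporting logic in evaluate.py concrete, here is a worked example (hypothetical numbers) of the pass-rate formula: outcomes counted as passing are "passed" plus expected failures ("xfail"), divided by all collected test ids, with tests that never ran treated as failed.

# Worked example of the pass-rate computation used above; counts are made up.
from collections import Counter

status = Counter({"passed": 44, "failed": 5, "xfail": 1})  # hypothetical outcomes
pass_rate = (status["passed"] + status["xfail"]) / sum(status.values())
assert abs(pass_rate - 0.9) < 1e-9  # (44 + 1) / 50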
12 changes: 9 additions & 3 deletions commit0/harness/get_pytest_ids.py
@@ -1,16 +1,22 @@
 import tarfile
+from typing import List
 
 
-def main(repo: str) -> None:
+def main(repo: str, stdout: bool) -> List[str]:
     repo = repo.lower()
     repo = repo.replace(".", "-")
+    out = ""
     with tarfile.open(f"commit0/data/test_ids/{repo}.tar.bz2", "r:bz2") as tar:
         for member in tar.getmembers():
             if member.isfile():
                 file = tar.extractfile(member)
                 if file:
-                    content = file.read()
-                    print(content.decode("utf-8"))
+                    content = file.read().decode("utf-8")
+                    out += content
+                    if stdout:
+                        print(content)
+    out = out.split("\n")
+    return out
 
 
 __all__ = []
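A short usage sketch of the updated helper: with stdout=False it returns the test ids as a list instead of only printing them, which is what evaluate.py relies on. The repo name here is only illustrative.

# Hedged sketch: collect test ids for a repo without printing them.
from commit0.harness.get_pytest_ids import main as get_tests

test_ids = get_tests("simpy", stdout=False)
print(len(test_ids), "test ids, e.g.", test_ids[0] if test_ids else None)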
30 changes: 22 additions & 8 deletions commit0/harness/run_pytest_ids.py
@@ -38,7 +38,12 @@ class ExecutionBackend(StrEnum):
 
 
 def run_docker(
-    spec: Spec, logger: logging.Logger, eval_file: Path, timeout: int, log_dir: Path
+    spec: Spec,
+    logger: logging.Logger,
+    eval_file: Path,
+    timeout: int,
+    log_dir: Path,
+    stdout: bool,
 ) -> None:
     client = docker.from_env()
     container = None
@@ -65,7 +70,8 @@ def run_docker(
             output, "--json-report --json-report-file=report.json"
         )
         # stdout might be more straightforward
-        print(test_output)
+        if stdout:
+            print(test_output)
         test_output_path = log_dir / "test_output.txt"
         with open(test_output_path, "w") as f:
             f.write(test_output)
@@ -105,7 +111,12 @@ def run_docker(
 
 
 def run_modal(
-    spec: Spec, logger: logging.Logger, eval_file: Path, timeout: int, log_dir: Path
+    spec: Spec,
+    logger: logging.Logger,
+    eval_file: Path,
+    timeout: int,
+    log_dir: Path,
+    stdout: bool,
 ) -> None:
     # get image name to pull from dockerhub
     # spec.repo_image_key
@@ -182,7 +193,8 @@ def run_modal(
         )
 
         # stdout might be more straightforward
-        print(test_output)
+        if stdout:
+            print(test_output)
         test_output_path = log_dir / "test_output.txt"
         with open(test_output_path, "w") as f:
             f.write(test_output)
@@ -204,7 +216,8 @@ def main(
     test_ids: str,
     backend: str,
     timeout: int,
-) -> None:
+    stdout: bool,
+) -> str:
     dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
     spec = None
     example = None
@@ -217,7 +230,7 @@
 
     hashed_test_ids = get_hash_string(test_ids)
     # set up logging
-    log_dir = RUN_PYTEST_LOG_DIR / repo / hashed_test_ids
+    log_dir = RUN_PYTEST_LOG_DIR / repo / branch / hashed_test_ids
     log_dir.mkdir(parents=True, exist_ok=True)
     log_file = log_dir / "run_pytest.log"
     logger = setup_logger(repo, log_file)
@@ -241,9 +254,10 @@
     eval_file.write_text(eval_script)
 
     if ExecutionBackend(backend) == ExecutionBackend.LOCAL:
-        run_docker(spec, logger, eval_file, timeout, log_dir)
+        run_docker(spec, logger, eval_file, timeout, log_dir, stdout)
     elif ExecutionBackend(backend) == ExecutionBackend.MODAL:
-        run_modal(spec, logger, eval_file, timeout, log_dir)
+        run_modal(spec, logger, eval_file, timeout, log_dir, stdout)
+    return str(log_dir)
 
 
 __all__ = []
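Finally, a hedged sketch of calling the updated runner directly: stdout=False suppresses the raw pytest output, and the return value is the per-run log directory, now namespaced by branch. The positional order follows the call in evaluate.py; all values below are placeholders rather than the project's real config.

# Hedged sketch; every value here is a placeholder for illustration only.
from commit0.harness.run_pytest_ids import main as run_tests

log_dir = run_tests(
    "some-org/some-dataset",              # dataset_name (placeholder)
    "test",                               # dataset_split (placeholder)
    "repos/",                             # base_dir (placeholder)
    "simpy",                              # repo
    "reference",                          # branch
    "tests/test_event.py::test_succeed",  # test_ids
    "local",                              # backend
    1800,                                 # timeout in seconds (placeholder)
    stdout=False,
)
print(log_dir)  # e.g. <RUN_PYTEST_LOG_DIR>/simpy/reference/<hash of test_ids>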