Skip to content

Commit 49c9333

Browse files
Merge remote-tracking branch 'origin/main' into modal-clean
2 parents 9e08b2c + e9cfc06 commit 49c9333

File tree

12 files changed

+359
-20
lines changed

12 files changed

+359
-20
lines changed

.github/workflows/system.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,10 @@ jobs:
2626
run: uv run commit0 get-tests simpy
2727
- name: Test
2828
run: uv run commit0 test-reference simpy tests/test_event.py::test_succeed
29+
- name: Evaluate
30+
run: uv run commit0 evaluate-reference simpy
31+
- name: Save
32+
env:
33+
GITHUB_TOKEN: ${{ secrets.MY_GITHUB_TOKEN }}
34+
run: |
35+
uv run commit0 save simpy test-save-commit0

commit0/__main__.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import commit0.harness.get_pytest_ids
33
import commit0.harness.build
44
import commit0.harness.setup
5+
import commit0.harness.evaluate
6+
import commit0.harness.save
57
import copy
68
import sys
79
import os
@@ -19,7 +21,7 @@ def main() -> None:
1921
)
2022
# type check config values
2123
cs = ConfigStore.instance()
22-
cs.store(name="user", node=Commit0Config)
24+
cs.store(name="user", group="Commit0Config", node=Commit0Config)
2325
# have hydra to ignore all command-line arguments
2426
sys_argv = copy.deepcopy(sys.argv)
2527
sys.argv = [sys.argv[0]]
@@ -28,8 +30,8 @@ def main() -> None:
2830
# after hydra gets all configs, put command-line arguments back
2931
sys.argv = sys_argv
3032
# repo_split: split from command line has a higher priority than split in hydra
31-
if command in ["clone", "build"]:
32-
if len(sys.argv) == 3:
33+
if command in ["clone", "build", "evaluate", "evaluate-reference", "save"]:
34+
if len(sys.argv) >= 3:
3335
if sys.argv[2] not in SPLIT:
3436
raise ValueError(
3537
f"repo split must be from {', '.join(SPLIT.keys())}, but you provided {sys.argv[2]}"
@@ -43,6 +45,7 @@ def main() -> None:
4345
config.dataset_split,
4446
config.repo_split,
4547
config.base_dir,
48+
config.branch,
4649
)
4750
elif command == "build":
4851
commit0.harness.build.main(
@@ -53,7 +56,7 @@ def main() -> None:
5356
)
5457
elif command == "get-tests":
5558
repo = sys.argv[2]
56-
commit0.harness.get_pytest_ids.main(repo)
59+
commit0.harness.get_pytest_ids.main(repo, stdout=True)
5760
elif command == "test" or command == "test-reference":
5861
repo = sys.argv[2]
5962
test_ids = sys.argv[3]
@@ -68,6 +71,31 @@ def main() -> None:
6871
test_ids,
6972
config.backend,
7073
config.timeout,
74+
stdout=True,
75+
)
76+
elif command == "evaluate" or command == "evaluate-reference":
77+
if command == "evaluate-reference":
78+
config.branch = "reference"
79+
commit0.harness.evaluate.main(
80+
config.dataset_name,
81+
config.dataset_split,
82+
config.repo_split,
83+
config.base_dir,
84+
config.branch,
85+
config.backend,
86+
config.timeout,
87+
config.num_workers,
88+
)
89+
elif command == "save":
90+
organization = sys.argv[3]
91+
commit0.harness.save.main(
92+
config.dataset_name,
93+
config.dataset_split,
94+
config.repo_split,
95+
config.base_dir,
96+
organization,
97+
config.branch,
98+
config.github_token,
7199
)
72100

73101

commit0/configs/base.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,6 @@ num_workers: 8
1616
backend: local
1717
branch: ai
1818
timeout: 1_800
19+
20+
# save related
21+
github_token: null

commit0/configs/config_class.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from dataclasses import dataclass
2+
from typing import Optional
23

34

45
@dataclass
@@ -21,3 +22,6 @@ class Commit0Config:
2122
branch: str
2223
# timeout for running pytest
2324
timeout: int
25+
26+
# save related
27+
github_token: Optional[str]

commit0/harness/constants.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,16 @@ class RepoInstance(TypedDict):
2626
EVAL_BACKENDS = ["local", "modal"]
2727

2828
# available commands
29-
COMMANDS = ["clone", "build", "test", "test-reference", "get-tests"]
29+
COMMANDS = [
30+
"clone",
31+
"build",
32+
"test",
33+
"test-reference",
34+
"get-tests",
35+
"evaluate",
36+
"evaluate-reference",
37+
"save",
38+
]
3039
# repo splits
3140
SPLIT_MINITORCH = ["minitorch"]
3241
SPLIT_SIMPY = ["simpy"]
@@ -80,7 +89,8 @@ class RepoInstance(TypedDict):
8089
"mimesis",
8190
"babel",
8291
"dnspython",
83-
"portalocker," "cookiecutter",
92+
"portalocker",
93+
"cookiecutter",
8494
"pyjwt",
8595
"python-rsa",
8696
"more-itertools",

commit0/harness/evaluate.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import logging
2+
import os
3+
import traceback
4+
from collections import Counter
5+
6+
from concurrent.futures import ThreadPoolExecutor, as_completed
7+
from datasets import load_dataset
8+
from tqdm import tqdm
9+
from typing import Iterator
10+
11+
from commit0.harness.run_pytest_ids import main as run_tests
12+
from commit0.harness.get_pytest_ids import main as get_tests
13+
from commit0.harness.constants import RepoInstance, SPLIT
14+
15+
logging.basicConfig(
16+
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
17+
)
18+
logger = logging.getLogger(__name__)
19+
20+
21+
def main(
    dataset_name: str,
    dataset_split: str,
    repo_split: str,
    base_dir: str,
    branch: str,
    backend: str,
    timeout: int,
    num_workers: int,
) -> None:
    """Evaluate a branch of every repo in *repo_split* and print a pass-rate report.

    Runs the pytest suite for each selected repo concurrently (via
    ``run_tests``), then parses each run's ``report.json`` to compute the
    per-repo runtime and the fraction of test ids that passed (``xfail``
    counts as passed).

    Args:
        dataset_name: HuggingFace dataset identifier holding the repo specs.
        dataset_split: Split of the dataset to load.
        repo_split: Key into ``SPLIT`` selecting repos, or ``"all"``.
        base_dir: Directory where the repos were cloned.
        branch: Git branch whose code is evaluated.
        backend: Execution backend (``local`` or ``modal``).
        timeout: Per-repo pytest timeout in seconds.
        num_workers: Number of concurrent evaluation workers.
    """
    dataset: Iterator[RepoInstance] = load_dataset(dataset_name, split=dataset_split)  # type: ignore
    pairs = []
    for example in dataset:
        repo_name = example["repo"].split("/")[-1]
        if repo_split != "all" and repo_name not in SPLIT[repo_split]:
            continue
        pairs.append((repo_name, example["test"]["test_dir"]))

    log_dirs = []
    # total=len(pairs): the bar must match the number of submitted jobs.
    # (Using len(SPLIT[repo_split]) diverges from the job count when
    # repo_split == "all" or a repo is absent from the dataset.)
    with tqdm(total=len(pairs), smoothing=0, desc="Evaluating repos") as pbar:
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            # One future per (repo, test_dir) pair.
            futures = {
                executor.submit(
                    run_tests,
                    dataset_name,
                    dataset_split,
                    base_dir,
                    repo,
                    branch,
                    test_dir,
                    backend,
                    timeout,
                    stdout=False,
                ): None
                for repo, test_dir in pairs
            }
            # Collect log directories as runs finish; a failed run is
            # reported but does not abort the remaining evaluations.
            for future in as_completed(futures):
                pbar.update(1)
                try:
                    # run_tests returns the log dir containing report.json.
                    log_dirs.append(future.result())
                except Exception:
                    traceback.print_exc()
                    continue

    # Aggregate per-repo numbers from each run's report.json.
    out = []
    for name in tqdm(log_dirs):
        report_file = os.path.join(name, "report.json")
        # Log dirs look like <root>/<pytest>/<repo>/<branch>/<hash>;
        # index 2 is the repo name — TODO confirm against RUN_PYTEST_LOG_DIR.
        name = name.split("/")[2]
        if not os.path.exists(report_file):
            # The run produced no report (e.g. it crashed): count it as
            # 0 passed. num_tests must be present — the summary below
            # reads x['num_tests'] and would otherwise raise KeyError.
            out.append(
                {
                    "name": name,
                    "sum": 0,
                    "passed": 0,
                    "num_passed": 0,
                    "num_tests": 0,
                }
            )
            continue
        report = load_dataset("json", data_files=report_file, split="train")  # type: ignore
        test_ids = get_tests(name, stdout=False)
        tests = {x["nodeid"]: x["call"] for x in report["tests"][0]}  # type: ignore
        status = []
        runtimes = []
        for test_id in test_ids:
            if test_id in tests and tests[test_id] is not None:
                status.append(tests[test_id]["outcome"])
                runtimes.append(tests[test_id]["duration"])
            else:
                # A test id that never ran is treated as a failure with
                # zero runtime.
                status.append("failed")
                runtimes.append(0)
        counts = Counter(status)  # Counter defaults missing outcomes to 0
        total = sum(runtimes)
        num_tests = sum(counts.values())
        # xfail (expected failure) counts toward the pass rate.
        num_passed = counts["passed"] + counts["xfail"]
        out.append(
            {
                "name": name,
                "sum": total,
                # Guard: a repo with no test ids must not divide by zero.
                "passed": num_passed / num_tests if num_tests else 0,
                "num_passed": num_passed,
                "num_tests": num_tests,
            }
        )
    print("repo,runtime,num_passed/num_tests")
    out = sorted(out, key=lambda x: x["sum"], reverse=True)
    for x in out:
        print(f"{x['name']},{x['sum']},{x['num_passed']}/{x['num_tests']}")
    total_runtime = sum([x["sum"] for x in out])
    # Guard against an empty split so the summary does not divide by zero.
    averaged_passed = sum([x["passed"] for x in out]) / len(out) if out else 0.0
    print(f"total runtime: {total_runtime}")
    print(f"average pass rate: {averaged_passed}")
124+
125+
126+
__all__ = []

commit0/harness/get_pytest_ids.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
11
import tarfile
2+
from typing import List
23

34

4-
def main(repo: str) -> None:
5+
def main(repo: str, stdout: bool) -> List[str]:
    """Return the pytest node ids bundled for *repo*.

    Reads every file inside ``commit0/data/test_ids/<repo>.tar.bz2`` and
    returns the concatenated contents split into one test id per line.

    Args:
        repo: Repository name; normalized to lowercase with ``.`` replaced
            by ``-`` to match the archive naming scheme.
        stdout: When True, also print each extracted file's contents.

    Returns:
        List of pytest node ids, with blank entries removed.
    """
    repo = repo.lower()
    repo = repo.replace(".", "-")
    out = ""
    with tarfile.open(f"commit0/data/test_ids/{repo}.tar.bz2", "r:bz2") as tar:
        for member in tar.getmembers():
            if member.isfile():
                file = tar.extractfile(member)
                if file:
                    content = file.read().decode("utf-8")
                    out += content
                    if stdout:
                        print(content)
    # Drop empty entries: a trailing newline in the archive content would
    # otherwise yield a spurious "" test id that downstream consumers
    # (e.g. evaluate.py) count as a failed test.
    return [test_id for test_id in out.split("\n") if test_id]
1420

1521

1622
__all__ = []

commit0/harness/run_pytest_ids.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,12 @@ class ExecutionBackend(StrEnum):
4141

4242

4343
def run_docker(
44-
spec: Spec, logger: logging.Logger, eval_file: Path, timeout: int, log_dir: Path
44+
spec: Spec,
45+
logger: logging.Logger,
46+
eval_file: Path,
47+
timeout: int,
48+
log_dir: Path,
49+
stdout: bool,
4550
) -> None:
4651
"""Runs the tests in a local docker container.
4752
@@ -76,7 +81,8 @@ def run_docker(
7681
output, "--json-report --json-report-file=report.json"
7782
)
7883
# stdout might be more straightforward
79-
print(test_output)
84+
if stdout:
85+
print(test_output)
8086
test_output_path = log_dir / "test_output.txt"
8187
with open(test_output_path, "w") as f:
8288
f.write(test_output)
@@ -116,7 +122,12 @@ def run_docker(
116122

117123

118124
def run_modal(
119-
spec: Spec, logger: logging.Logger, eval_file: Path, timeout: int, log_dir: Path
125+
spec: Spec,
126+
logger: logging.Logger,
127+
eval_file: Path,
128+
timeout: int,
129+
log_dir: Path,
130+
stdout: bool,
120131
) -> None:
121132
"""Runs the tests in a remote Modal container.
122133
@@ -156,7 +167,32 @@ def run_modal(
156167
# TODO: add timeout
157168
print(output)
158169
print(error)
159-
return
170+
171+
output = []
172+
for line in process.stderr:
173+
output.append(line)
174+
output_s = "".join(line)
175+
logger.info(output_s)
176+
print(output_s)
177+
178+
timed_out = False
179+
test_output = extract_test_output(
180+
output_s, "--json-report --json-report-file=report.json"
181+
)
182+
183+
# stdout might be more straightforward
184+
if stdout:
185+
print(test_output)
186+
test_output_path = log_dir / "test_output.txt"
187+
with open(test_output_path, "w") as f:
188+
f.write(test_output)
189+
if timed_out:
190+
f.write(f"\n\nTimeout error: {timeout} seconds exceeded.")
191+
raise EvaluationError(
192+
spec.repo,
193+
f"Test timed out after {timeout} seconds.",
194+
logger,
195+
)
160196

161197

162198
def main(
@@ -168,7 +204,8 @@ def main(
168204
test_ids: str,
169205
backend: str,
170206
timeout: int,
171-
) -> None:
207+
stdout: bool,
208+
) -> str:
172209
"""Runs the pytests for repos in a dataset.
173210
174211
Tests are run either locally through docker
@@ -186,7 +223,7 @@ def main(
186223

187224
hashed_test_ids = get_hash_string(test_ids)
188225
# set up logging
189-
log_dir = RUN_PYTEST_LOG_DIR / repo / hashed_test_ids
226+
log_dir = RUN_PYTEST_LOG_DIR / repo / branch / hashed_test_ids
190227
log_dir.mkdir(parents=True, exist_ok=True)
191228
log_file = log_dir / "run_pytest.log"
192229
logger = setup_logger(repo, log_file)
@@ -262,9 +299,11 @@ def main(
262299

263300
"""
264301
if ExecutionBackend(backend) == ExecutionBackend.LOCAL:
265-
run_docker(spec, logger, eval_file, timeout, log_dir)
302+
run_docker(spec, logger, eval_file, timeout, log_dir, stdout)
266303
elif ExecutionBackend(backend) == ExecutionBackend.MODAL:
304+
run_modal(spec, logger, eval_file, timeout, log_dir, stdout)
267305
"""
306+
return str(log_dir)
268307

269308

270309
__all__ = []

0 commit comments

Comments
 (0)