From 437e066a66c4f3d6aeba26f79fe1c3d8e4ea5743 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 10 Jul 2023 17:46:03 -0700 Subject: [PATCH] Add "Simple web server" challenge (#74) Co-authored-by: Silen Naihin --- agbenchmark/RegressionManager.py | 15 +++- .../code/d3/custom_python/api_tests.py | 27 +++++++ agbenchmark/challenges/code/d3/data.json | 18 +++++ agbenchmark/challenges/test_all.py | 25 ++++++- poetry.lock | 31 +++++++- pyproject.toml | 1 + regression_tests.json | 73 ++++++++++++------- 7 files changed, 160 insertions(+), 30 deletions(-) create mode 100644 agbenchmark/challenges/code/d3/custom_python/api_tests.py create mode 100644 agbenchmark/challenges/code/d3/data.json diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py index e289a4787802..ac9efc69693e 100644 --- a/agbenchmark/RegressionManager.py +++ b/agbenchmark/RegressionManager.py @@ -1,4 +1,5 @@ import json +from typing import Union class RegressionManager: @@ -15,7 +16,9 @@ def load(self) -> None: f.read().strip() ) # read the content and remove any leading/trailing whitespace if file_content: # if file is not empty, load the json - self.tests = json.loads(file_content) + data = json.loads(file_content) + self.tests = {k: data[k] for k in sorted(data)} + data = self.replace_backslash(data) else: # if file is empty, assign an empty dictionary self.tests = {} except FileNotFoundError: @@ -36,3 +39,13 @@ def remove_test(self, test_name: str) -> None: if test_name in self.tests: del self.tests[test_name] self.save() + + def replace_backslash(self, value: str) -> Union[str, list[str], dict]: + if isinstance(value, str): + return value.replace("\\\\", "/") # escape \ with \\ + elif isinstance(value, list): + return [self.replace_backslash(i) for i in value] + elif isinstance(value, dict): + return {k: self.replace_backslash(v) for k, v in value.items()} + else: + return value diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d3/custom_python/api_tests.py new file mode 100644 index 000000000000..1d6255ebd155 --- /dev/null +++ b/agbenchmark/challenges/code/d3/custom_python/api_tests.py @@ -0,0 +1,27 @@ +import os +from typing import Any, Dict +from unittest.mock import Mock, patch + +import requests + + +def make_assertion() -> None: + if os.environ.get("MOCK_TEST", "False").lower() == "true": + mock_response = Mock(requests.Response) + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "OK"} + + with patch("requests.get", return_value=mock_response): + make_request_and_assert() + else: + make_request_and_assert() + + +def make_request_and_assert() -> Dict[str, Any]: + response = requests.get("http://localhost:8079/health") + if response.status_code != 200: + raise AssertionError( + f"Expected status code 200, but got {response.status_code}" + ) + + return response.json() diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json new file mode 100644 index 000000000000..07d607f5fc73 --- /dev/null +++ b/agbenchmark/challenges/code/d3/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestCreateSimpleWebServer", + "category": ["code"], + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "ground": { + "answer": "GET localhost:8079/health responds with a 200 OK", + "should_contain": [], + "should_not_contain": [], + "files": [], + "type": "custom_python" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to build a simple web server locally", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index e7fe99e738e7..7dee0b2ab82e 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -2,6 +2,8 @@ import importlib import json import os +import pkgutil +import sys import types from pathlib import Path from typing import Any, Dict @@ -47,6 +49,19 @@ def generate_tests() -> None: class_name = data.get("name", "") challenge_location = get_test_path(json_file) + if data["ground"]["type"] == "custom_python": + custom_python_location = ( + f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" + ) + sys.path.append(str(custom_python_location)) + + for (module_loader, name, ispkg) in pkgutil.iter_modules( + [str(custom_python_location)] + ): + module = importlib.import_module(name) + + if hasattr(module, "make_assertion"): + make_assertion = getattr(module, "make_assertion") # Define test class dynamically challenge_class = types.new_class(class_name, (Challenge,)) @@ -58,7 +73,15 @@ def test_method(self, config: Dict[str, Any]) -> None: # type: ignore self.setup_challenge(config) scores = self.get_scores(config) - assert 1 in scores + + # Check if make_assertion is defined and use it + if "make_assertion" in locals(): + try: + make_assertion() + except AssertionError as error: + print(error) # Or handle this in another way + else: + assert 1 in scores # Parametrize the method here test_method = pytest.mark.parametrize( diff --git a/poetry.lock b/poetry.lock index 4eae340b677a..5526da16b9cc 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -961,6 +961,33 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "types-requests" +version = "2.31.0.1" +description = "Typing stubs for requests" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "types-requests-2.31.0.1.tar.gz", hash = "sha256:3de667cffa123ce698591de0ad7db034a5317457a596eb0b4944e5a9d9e8d1ac"}, + {file = "types_requests-2.31.0.1-py3-none-any.whl", hash = "sha256:afb06ef8f25ba83d59a1d424bd7a5a939082f94b94e90ab5e6116bd2559deaa3"}, +] + +[package.dependencies] +types-urllib3 = "*" + +[[package]] +name = "types-urllib3" +version = "1.26.25.13" +description = "Typing stubs for urllib3" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "types-urllib3-1.26.25.13.tar.gz", hash = "sha256:3300538c9dc11dad32eae4827ac313f5d986b8b21494801f1bf97a1ac6c03ae5"}, + {file = "types_urllib3-1.26.25.13-py3-none-any.whl", hash = "sha256:5dbd1d2bef14efee43f5318b5d36d805a489f6600252bb53626d4bfafd95e27c"}, +] + [[package]] name = "typing-extensions" version = "4.7.1" @@ -1082,4 +1109,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "44b5789494e73f3cb8bcb9d25daa62143e59352a246fd7724fdb3ad58c2560ae" +content-hash = "81b84bbe08d4a09fb6a4f99c7fb018e0c0fcd879fa368c388b0af20c7c9a3f31" diff --git a/pyproject.toml b/pyproject.toml index a8f4f8dee5f6..1a96a51de188 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^0.21.0" click = "^8.1.3" +types-requests = "^2.31.0.1" [tool.poetry.group.dev.dependencies] flake8 = "^3.9.2" diff --git a/regression_tests.json b/regression_tests.json index 10a6e11bf64a..0cf2d5f30ae3 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,69 +1,90 @@ { - "TestWriteFile": { + "TestBasicMemory": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark\\challenges\\interface\\write_file" + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/memory/m1" }, - "TestReadFile": { + "TestBasicRetrieval": { "difficulty": "basic", "dependencies": [ - "TestWriteFile" + "TestWriteFile", + "TestSearch" ], - "test": "agbenchmark\\challenges\\interface\\read_file" + "test": "agbenchmark/challenges/retrieval/r1" }, - "TestBasicMemory": { + "TestCreateSimpleWebServer": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d3" + }, + "TestDebugSimpleTypoWithGuidance": { "difficulty": "basic", "dependencies": [ "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark\\challenges\\memory\\m1" + "test": "agbenchmark/challenges/code/d1" }, - "TestBasicRetrieval": { + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "medium", + "dependencies": [ + "TestDebugSimpleTypoWithGuidance" + ], + "test": "agbenchmark/challenges/code/d2" + }, + "TestReadFile": { "difficulty": "basic", "dependencies": [ "TestWriteFile" ], - "test": "agbenchmark\\challenges\\retrieval\\r1" + "test": "agbenchmark/challenges/interface/read_file" }, "TestRememberMultipleIds": { "difficulty": "basic", "dependencies": [ "TestBasicMemory" ], - "test": "agbenchmark\\challenges\\memory\\m2" + "test": "agbenchmark/challenges/memory/m2" }, - "TestRetrieval2": { - "difficulty": "basic", + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", "dependencies": [ - "TestBasicRetrieval" + "TestRememberMultipleIds" ], - "test": "agbenchmark\\challenges\\retrieval\\r2" + "test": "agbenchmark/challenges/memory/m3" }, - "TestRememberMultipleIdsWithNoise": { + "TestRememberMultiplePhrasesWithNoise": { "difficulty": "medium", "dependencies": [ - "TestRememberMultipleIds" + "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark\\challenges\\memory\\m3" + "test": "agbenchmark/challenges/memory/m4" }, - "TestRetrieval3": { + "TestRetrieval2": { "difficulty": "basic", "dependencies": [ - "TestRetrieval2" + "TestBasicRetrieval" ], - "test": "agbenchmark\\challenges\\retrieval\\r3" + "test": "agbenchmark/challenges/retrieval/r2" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "TestRetrieval3": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIdsWithNoise" + "TestRetrieval2" ], - "test": "agbenchmark\\challenges\\memory\\m4" + "test": "agbenchmark/challenges/retrieval/r3" }, "TestSearch": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark\\challenges\\interface\\search" + "test": "agbenchmark/challenges/interface/search" + }, + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file