diff --git a/agbenchmark/ReportManager.py b/agbenchmark/ReportManager.py index e6d8f62f63b..cae13595a70 100644 --- a/agbenchmark/ReportManager.py +++ b/agbenchmark/ReportManager.py @@ -3,7 +3,9 @@ import sys import time from datetime import datetime -from typing import Any, Dict, Union +from typing import Any, Dict + +from agbenchmark.utils import get_highest_success_difficulty class ReportManager: @@ -23,7 +25,6 @@ def load(self) -> None: if file_content: # if file is not empty, load the json data = json.loads(file_content) self.tests = {k: data[k] for k in sorted(data)} - data = self.replace_backslash(data) else: # if file is empty, assign an empty dictionary self.tests = {} except FileNotFoundError: @@ -36,8 +37,9 @@ def save(self) -> None: with open(self.filename, "w") as f: json.dump(self.tests, f, indent=4) - def add_test(self, test_name: str, test_details: dict) -> None: + def add_test(self, test_name: str, test_details: dict | list) -> None: self.tests[test_name] = test_details + self.save() def remove_test(self, test_name: str) -> None: @@ -50,19 +52,12 @@ def end_info_report(self, config: Dict[str, Any]) -> None: self.tests = { "command": command.split(os.sep)[-1], "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"), - "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds", + "metrics": { + "run_time": str(round(time.time() - self.start_time, 2)) + " seconds", + "highest_difficulty": get_highest_success_difficulty(self.tests), + }, "tests": self.tests, "config": config, } self.save() - - def replace_backslash(self, value: str) -> Union[str, list[str], dict]: - if isinstance(value, str): - return value.replace("\\\\", "/") # escape \ with \\ - elif isinstance(value, list): - return [self.replace_backslash(i) for i in value] - elif isinstance(value, dict): - return {k: self.replace_backslash(v) for k, v in value.items()} - else: - return value diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index a1a79ada0ac..991a7e8e0c2 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -23,26 +23,10 @@ def run_agent( """Calling to get a response""" if MOCK_FLAG: - print("ITS A MOCK TEST", challenge_location) copy_artifacts_into_workspace( config["workspace"], "artifacts_out", challenge_location ) else: - timeout = config["cutoff"] - print( - f"Running Python function '{config['entry_path']}' with timeout {timeout}" - ) - command = [sys.executable, "-m", config["entry_path"], str(task)] - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - cwd=os.getcwd(), - ) - - start_time = time.time() - print( f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}" ) diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index 0c724600050..061c924f52d 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -13,6 +13,6 @@ "info": { "difficulty": "basic", "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index 2923010949b..6523ef1d843 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -11,8 +11,8 @@ "type": "execute_python_code" }, "info": { - "difficulty": 
"medium", + "difficulty": "novice", "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json index 07d607f5fc7..94c81664c8d 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d3/data.json @@ -11,7 +11,7 @@ "type": "custom_python" }, "info": { - "difficulty": "medium", + "difficulty": "advanced", "description": "Tests ability for the agent to build a simple web server locally", "side_effects": [] } diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f4e3f2220ed..668025dd295 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -1,15 +1,52 @@ import json +from enum import Enum from pathlib import Path from typing import List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, validator + + +class DifficultyLevel(Enum): + interface = "interface" + basic = "basic" + novice = "novice" + intermediate = "intermediate" + advanced = "advanced" + expert = "expert" + human = "human" + + +# map from enum to difficulty level (numeric) +DIFFICULTY_MAP = { + DifficultyLevel.interface: 1, + DifficultyLevel.basic: 2, + DifficultyLevel.novice: 3, + DifficultyLevel.intermediate: 4, + DifficultyLevel.advanced: 5, + DifficultyLevel.expert: 6, + DifficultyLevel.human: 7, +} class Info(BaseModel): - difficulty: str + difficulty: DifficultyLevel description: str side_effects: List[str] + @validator("difficulty", pre=True) + def difficulty_to_enum(cls: "Info", v: str | DifficultyLevel) -> DifficultyLevel: + """Convert a string to an instance of DifficultyLevel.""" + if isinstance(v, DifficultyLevel): + return v + + if isinstance(v, str): + try: + return DifficultyLevel(v.lower()) + except ValueError: + pass + + raise ValueError(f"Cannot convert {v} to DifficultyLevel.") + class Ground(BaseModel): answer: str diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index c827581b6bf..1bf340454d8 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -11,7 +11,7 @@ }, "info": { "description": "This reads the file quickly", - "difficulty": "basic", + "difficulty": "interface", "side_effects": [""] } } diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json index f59b2dc9b53..de8934d9553 100644 --- a/agbenchmark/challenges/interface/search/data.json +++ b/agbenchmark/challenges/interface/search/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "interface", "description": "Tests if an llm can search", "side_effects": [""] } diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index 2be2d0dfe8d..8db9cd620b4 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "interface", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git 
a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index 506b246ad02..ab86f1c3cfb 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -10,13 +10,9 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_memory_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "basic", "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 7ef2552d1cc..9205c99f113 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -11,8 +11,8 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "novice", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 720cce93c0c..3b78d614b44 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -11,8 +11,8 @@ "type": "file" }, "info": { - "difficulty": "medium", + "difficulty": "intermediate", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 61965206bdb..84f5c2b21d1 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -16,8 +16,8 @@ "type": "file" }, "info": { - "difficulty": "medium", + "difficulty": "advanced", "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 4f3833dfc1c..e3e09302db6 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -13,6 +13,6 @@ "info": { "difficulty": "basic", "description": "Tests ability to retrieve information from a website.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 5bc2e96b4a5..977be4bcddc 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "novice", "description": "Tests ability to retrieve information.", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index b918d3d4e81..5504908eaef 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -27,7 +27,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "intermediate", "description": "Tests ability to retrieve information.", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 00a6ed6351d..a5afef96c9a 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -9,15 +9,10 @@ from typing import Any, Dict import pytest -from dotenv import load_dotenv from agbenchmark.challenge import Challenge from agbenchmark.start_benchmark import CURRENT_DIRECTORY - -load_dotenv() - -IMPROVE = os.getenv("IMPROVE", "False") - +from agbenchmark.utils import replace_backslash json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True) @@ -36,7 +31,11 @@ def get_test_path(json_file: str) -> str: # Create the path from "agbenchmark" onwards challenge_location = Path(*path.parts[agbenchmark_index:]) - return str(challenge_location) + formatted_location = replace_backslash(str(challenge_location)) + if isinstance(formatted_location, str): + return formatted_location + else: + return str(challenge_location) def generate_tests() -> None: @@ -68,7 +67,7 @@ def test_method(self, config: Dict[str, Any]) -> None: # type: ignore ) sys.path.append(str(custom_python_location)) - for (module_loader, name, ispkg) in pkgutil.iter_modules( + for module_loader, name, ispkg in pkgutil.iter_modules( [str(custom_python_location)] ): module = importlib.import_module(name) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 87fdc9c104a..b91b5f9f8fc 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,6 +1,8 @@ import json import os import shutil +import sys +import time from pathlib import Path # noqa from typing import Any, Dict, Generator @@ -13,6 +15,7 @@ REGRESSION_TESTS_PATH, get_regression_data, ) +from agbenchmark.utils import calculate_success_percentage def resolve_workspace(config: Dict[str, Any]) -> str: @@ -107,9 +110,29 @@ def challenge_data(request: Any) -> None: return request.param +@pytest.fixture(autouse=True, scope="session") +def mock(request: Any) -> None: + 
return request.config.getoption("--mock") + + +@pytest.fixture(autouse=True, scope="function") +def timer(request: Any) -> Any: + start_time = time.time() + yield + run_time = time.time() - start_time + request.node.user_properties.append(("run_time", run_time)) + + +# tests that consistently pass are considered regression tests regression_manager = ReportManager(REGRESSION_TESTS_PATH) + +# user facing reporting information info_manager = ReportManager(INFO_TESTS_PATH) +INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py +# internal db step in replacement track pass/fail rate +internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json")) + def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": @@ -122,23 +145,66 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: ) # Extract the challenge_location from the class challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "") + test_name = item.nodeid.split("::")[1] + item.test_name = test_name test_details = { "difficulty": difficulty, "dependencies": dependencies, - "test": challenge_location, + "data_path": challenge_location, + } + + info_details: Any = { + "data_path": challenge_location, + "is_regression": False, + "metrics": { + "difficulty": difficulty, + "success": False, + }, } - print("pytest_runtest_makereport", test_details) + mock = "--mock" in sys.argv # Check if --mock is in sys.argv + if call.excinfo is None: - regression_manager.add_test(item.nodeid.split("::")[1], test_details) - test_details["success"] = True + info_details["metrics"]["success"] = True else: - regression_manager.remove_test(item.nodeid.split("::")[1]) - test_details["success"] = False - test_details["fail_reason"] = str(call.excinfo.value) + if not mock: # don't remove if it's a mock test + regression_manager.remove_test(test_name) + info_details["metrics"]["fail_reason"] = str(call.excinfo.value) + + prev_test_results: list[bool] = [] + + if not mock: + # only add if it's an actual test + prev_test_results = internal_info.tests.get(test_name, []) + prev_test_results.append(info_details["metrics"]["success"]) + internal_info.add_test(test_name, prev_test_results) + + # can calculate success rate regardless of mock + info_details["metrics"]["success_%"] = calculate_success_percentage( + prev_test_results + ) + + if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]: + # if the last 3 tests were successful, add to the regression tests + info_details["is_regression"] = True + regression_manager.add_test(test_name, test_details) + + # user facing reporting + item.info_details = info_details + if call.when == "teardown": + run_time = dict(item.user_properties).get("run_time") + + info_details = getattr(item, "info_details", {}) + test_name = getattr(item, "test_name", "") + + if info_details and test_name: + if run_time: + info_details["metrics"][ + "run_time" + ] = f"{str(round(run_time, 3))} seconds" - info_manager.add_test(item.nodeid.split("::")[1], test_details) + info_manager.add_test(test_name, info_details) def pytest_sessionfinish(session: Any) -> None: @@ -146,6 +212,7 @@ def pytest_sessionfinish(session: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) + internal_info.save() info_manager.end_info_report(config) regression_manager.save() diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json new file mode 100644 index 00000000000..5f46bd854ae --- /dev/null +++ b/agbenchmark/internal_info.json @@ -0,0 
+1,67 @@ +{ + "TestBasicMemory": [ + true, + true, + true + ], + "TestBasicRetrieval": [ + true, + true, + true + ], + "TestCreateSimpleWebServer": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [ + false, + false, + false + ], + "TestReadFile": [ + true, + true, + true + ], + "TestRememberMultipleIds": [ + true, + true, + true + ], + "TestRememberMultipleIdsWithNoise": [ + true, + true, + true + ], + "TestRememberMultiplePhrasesWithNoise": [ + true, + true, + true + ], + "TestRetrieval2": [ + true, + true, + true + ], + "TestRetrieval3": [ + true, + true, + true + ], + "TestSearch": [ + true, + true, + true + ], + "TestWriteFile": [ + true, + true, + true + ] +} \ No newline at end of file diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json index 68632a127d7..ce73ce263a5 100644 --- a/agbenchmark/regression_tests.json +++ b/agbenchmark/regression_tests.json @@ -1,20 +1,11 @@ { - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/read_file", - "success": true - }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [ "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m1", - "success": true + "data_path": "agbenchmark/challenges/memory/m1" }, "TestBasicRetrieval": { "difficulty": "basic", @@ -22,78 +13,60 @@ "TestWriteFile", "TestSearch" ], - "test": "agbenchmark/challenges/retrieval/r1", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r1" }, - "TestRememberMultipleIds": { + "TestReadFile": { "difficulty": "basic", "dependencies": [ - "TestBasicMemory" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m2", - "success": true + "data_path": "agbenchmark/challenges/interface/read_file" }, - "TestRetrieval2": { + "TestRememberMultipleIds": { "difficulty": "basic", "dependencies": [ - "TestBasicRetrieval" + "TestBasicMemory" ], - "test": "agbenchmark/challenges/retrieval/r2", - "success": true + "data_path": "agbenchmark/challenges/memory/m2" }, "TestRememberMultipleIdsWithNoise": { "difficulty": "medium", "dependencies": [ "TestRememberMultipleIds" ], - "test": "agbenchmark/challenges/memory/m3", - "success": true - }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [ - "TestRetrieval2" - ], - "test": "agbenchmark/challenges/retrieval/r3", - "success": true + "data_path": "agbenchmark/challenges/memory/m3" }, "TestRememberMultiplePhrasesWithNoise": { "difficulty": "medium", "dependencies": [ "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark/challenges/memory/m4", - "success": true + "data_path": "agbenchmark/challenges/memory/m4" }, - "TestSearch": { + "TestRetrieval2": { "difficulty": "basic", "dependencies": [ - "TestWriteFile" + "TestBasicRetrieval" ], - "test": "agbenchmark/challenges/interface/search", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r2" }, - "TestWriteFile": { + "TestRetrieval3": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file", - "success": true + "dependencies": [ + "TestRetrieval2" + ], + "data_path": "agbenchmark/challenges/retrieval/r3" }, - "TestDebugSimpleTypoWithGuidance": { + "TestSearch": { "difficulty": "basic", "dependencies": [ - "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark/challenges/code/d1", - "success": true + "data_path": "agbenchmark/challenges/interface/search" }, - 
"TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "medium", - "dependencies": [ - "TestDebugSimpleTypoWithGuidance" - ], - "test": "agbenchmark/challenges/code/d2", - "success": true + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "data_path": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json index df07fb878db..45945a3ee63 100644 --- a/agbenchmark/reports/1.json +++ b/agbenchmark/reports/1.json @@ -1,109 +1,148 @@ { "command": "agbenchmark start --mock", - "completion_time": "2023-07-10-21:19", - "time_elapsed": "8.75 seconds", + "completion_time": "2023-07-11-21:09", + "metrics": { + "run_time": "0.96 seconds", + "highest_difficulty": "advanced: 5" + }, "tests": { "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file", - "success": true + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.008 seconds" + } }, "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/read_file", - "success": true + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.005 seconds" + } }, "TestSearch": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/search", - "success": true + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.006 seconds" + } }, "TestDebugSimpleTypoWithGuidance": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": "agbenchmark/challenges/code/d1", - "success": true + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0, + "run_time": "0.489 seconds" + } }, "TestBasicMemory": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": "agbenchmark/challenges/memory/m1", - "success": true + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 0, + "run_time": "0.02 seconds" + } }, "TestBasicRetrieval": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile", - "TestSearch" - ], - "test": "agbenchmark/challenges/retrieval/r1", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 0, + "run_time": "0.01 seconds" + } }, "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "medium", - "dependencies": [ - "TestDebugSimpleTypoWithGuidance" - ], - "test": "agbenchmark/challenges/code/d2", - "success": true + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on 
agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0, + "run_time": "0.001 seconds" + } }, "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [ - "TestBasicMemory" - ], - "test": "agbenchmark/challenges/memory/m2", - "success": true + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 0, + "run_time": "0.018 seconds" + } }, "TestRetrieval2": { - "difficulty": "basic", - "dependencies": [ - "TestBasicRetrieval" - ], - "test": "agbenchmark/challenges/retrieval/r2", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 0, + "run_time": "0.009 seconds" + } }, "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIds" - ], - "test": "agbenchmark/challenges/memory/m3", - "success": true + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 0, + "run_time": "0.022 seconds" + } }, "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [ - "TestRetrieval2" - ], - "test": "agbenchmark/challenges/retrieval/r3", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 0, + "run_time": "0.01 seconds" + } }, "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIdsWithNoise" - ], - "test": "agbenchmark/challenges/memory/m4", - "success": true + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 0, + "run_time": "0.021 seconds" + } } }, "config": { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark/benchmarks.py", + "entry_path": "agbenchmark.benchmarks", "cutoff": 60 } } \ No newline at end of file diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index ffde0c6d359..598113d3d7f 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,6 +1,10 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
 import glob
+import re
 from pathlib import Path
+from typing import Any
+
+from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel


 def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
@@ -15,3 +19,54 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
     run_name = f"{file_count + 1}.json"
     new_file_path = INFO_TESTS_PATH / run_name
     return str(new_file_path)
+
+
+def replace_backslash(value: Any) -> Any:
+    if isinstance(value, str):
+        return re.sub(
+            r"\\+", "/", value
+        )  # replace one or more backslashes with a forward slash
+    elif isinstance(value, list):
+        return [replace_backslash(i) for i in value]
+    elif isinstance(value, dict):
+        return {k: replace_backslash(v) for k, v in value.items()}
+    else:
+        return value
+
+
+def calculate_success_percentage(results: list[bool]) -> float:
+    success_count = results.count(True)
+    total_count = len(results)
+    if total_count == 0:
+        return 0
+    success_percentage = (success_count / total_count) * 100  # as a percentage
+    return round(success_percentage, 2)
+
+
+def get_highest_success_difficulty(data: dict) -> str:
+    highest_difficulty = None
+    highest_difficulty_level = -1
+
+    for test_name, test_data in data.items():
+        if test_data["metrics"]["success"]:
+            # Replace 'medium' with 'intermediate' for this example
+            difficulty_str = test_data["metrics"]["difficulty"]
+
+            try:
+                difficulty_enum = DifficultyLevel[difficulty_str.lower()]
+                difficulty_level = DIFFICULTY_MAP[difficulty_enum]
+
+                if difficulty_level > highest_difficulty_level:
+                    highest_difficulty = difficulty_enum
+                    highest_difficulty_level = difficulty_level
+            except KeyError:
+                print(
+                    f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
+                )
+
+    if highest_difficulty is not None:
+        highest_difficulty_str = highest_difficulty.name  # convert enum to string
+    else:
+        highest_difficulty_str = ""
+
+    return f"{highest_difficulty_str}: {highest_difficulty_level}"
diff --git a/agent/SuperAGI b/agent/SuperAGI
index 9280512910c..bd4b3def65e 160000
--- a/agent/SuperAGI
+++ b/agent/SuperAGI
@@ -1 +1 @@
-Subproject commit 9280512910c74bc33333e2ce7c48e47021227529
+Subproject commit bd4b3def65e964182b05bb9f7a350b00f55a6007
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index 42400fd6797..cde9be3e732 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@
-Subproject commit 42400fd67972278e454621e7abf450a4f899a44a
+Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333
diff --git a/agent/mini-agi b/agent/mini-agi
index 6a1d08880c6..08764876d9a 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit 6a1d08880c65fe3e5831243c1e1ea19acf85516c
+Subproject commit 08764876d9a5c84c9f9e879088854d2b9349d7a0
diff --git a/agent/smol-developer b/agent/smol-developer
index a0e9f4f39e2..c52b14b1d5b 160000
--- a/agent/smol-developer
+++ b/agent/smol-developer
@@ -1 +1 @@
-Subproject commit a0e9f4f39e26a56b13a364be09fc58d2d85150ea
+Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d
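
A few notes on the mechanics this change introduces, with small illustrative sketches. None of the code below is part of the diff; any helper name or sample data not shown in the hunks above is made up for illustration.

The data.json files now use the seven-level difficulty scale from define_task_types.py instead of the old basic/medium strings, and Info.difficulty is validated against the DifficultyLevel enum. A minimal sketch of how a "difficulty" string from a data.json is coerced (pydantic v1 style, matching the @validator import; the model is trimmed to the one field):

from enum import Enum

from pydantic import BaseModel, validator


class DifficultyLevel(Enum):
    interface = "interface"
    basic = "basic"
    novice = "novice"
    intermediate = "intermediate"
    advanced = "advanced"
    expert = "expert"
    human = "human"


class Info(BaseModel):
    difficulty: DifficultyLevel

    @validator("difficulty", pre=True)
    def difficulty_to_enum(cls, v):
        # accept either an enum member or its string value, case-insensitively
        if isinstance(v, DifficultyLevel):
            return v
        if isinstance(v, str):
            try:
                return DifficultyLevel(v.lower())
            except ValueError:
                pass
        raise ValueError(f"Cannot convert {v} to DifficultyLevel.")


print(Info(difficulty="interface").difficulty)  # DifficultyLevel.interface
# Info(difficulty="medium") now raises, which is why the old "medium" entries
# were remapped to novice/intermediate/advanced in this diff.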
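
The new "highest_difficulty" field in the run report (e.g. "advanced: 5" in reports/1.json) comes from get_highest_success_difficulty: each DifficultyLevel maps to a number via DIFFICULTY_MAP, and the maximum over the tests that succeeded is reported. A condensed, self-contained sketch; the real code keys DIFFICULTY_MAP by enum members, and the sample dict below is trimmed:

# plain strings stand in for DifficultyLevel members to keep the sketch short
DIFFICULTY_MAP = {
    "interface": 1, "basic": 2, "novice": 3, "intermediate": 4,
    "advanced": 5, "expert": 6, "human": 7,
}

tests = {
    "TestWriteFile": {"metrics": {"difficulty": "interface", "success": True}},
    "TestRememberMultiplePhrasesWithNoise": {
        "metrics": {"difficulty": "advanced", "success": True}
    },
    "TestCreateSimpleWebServer": {"metrics": {"difficulty": "advanced", "success": False}},
}

ranks = [
    DIFFICULTY_MAP[t["metrics"]["difficulty"]]
    for t in tests.values()
    if t["metrics"]["success"]
]
print(max(ranks) if ranks else -1)  # 5 -> reported as "advanced: 5"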
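
conftest.py now keeps a rolling pass/fail history per test in internal_info.json (through a third ReportManager) and only promotes a test to regression_tests.json once its last three runs have all passed; calculate_success_percentage turns the same history into the "success_%" metric. A sketch of that rule on its own; is_regression is a hypothetical name, the real check lives inline in pytest_runtest_makereport:

def calculate_success_percentage(results: list[bool]) -> float:
    # mirrors agbenchmark.utils.calculate_success_percentage
    if not results:
        return 0.0
    return round(results.count(True) / len(results) * 100, 2)


def is_regression(history: list[bool]) -> bool:
    # a test counts as a regression test after 3 consecutive passes
    return len(history) >= 3 and history[-3:] == [True, True, True]


history = [False, True, True, True]  # one entry appended per real (non --mock) run
print(calculate_success_percentage(history))  # 75.0
print(is_regression(history))  # True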
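
Per-test run times now come from an autouse, function-scoped timer fixture that stashes the elapsed time in the pytest item's user_properties; the makereport hook reads it back during teardown and formats the "run_time" strings seen in reports/1.json. A stripped-down conftest sketch of just that flow:

import time
from typing import Any

import pytest


@pytest.fixture(autouse=True, scope="function")
def timer(request: Any) -> Any:
    start_time = time.time()
    yield
    request.node.user_properties.append(("run_time", time.time() - start_time))


def pytest_runtest_makereport(item: Any, call: Any) -> None:
    if call.when == "teardown":
        run_time = dict(item.user_properties).get("run_time")
        if run_time is not None:
            # the real hook stores this under info_details["metrics"]["run_time"]
            print(f"{item.nodeid}: {round(run_time, 3)} seconds")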
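
Finally, challenge locations are normalized before they become report keys: get_test_path in test_all.py runs the path through the new replace_backslash helper so Windows separators match the "agbenchmark/challenges/..." paths used in regression_tests.json and the reports. A quick check of what the regex does:

import re
from typing import Any


def replace_backslash(value: Any) -> Any:
    # one or more backslashes collapse to a single forward slash;
    # lists and dicts are walked recursively
    if isinstance(value, str):
        return re.sub(r"\\+", "/", value)
    if isinstance(value, list):
        return [replace_backslash(i) for i in value]
    if isinstance(value, dict):
        return {k: replace_backslash(v) for k, v in value.items()}
    return value


assert replace_backslash("agbenchmark\\challenges\\memory\\m1") == "agbenchmark/challenges/memory/m1"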