Skip to content
This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

fixing backslashes, adding basic metrics #89

Merged
merged 13 commits into from
Jul 12, 2023
23 changes: 9 additions & 14 deletions agbenchmark/ReportManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import sys
import time
from datetime import datetime
from typing import Any, Dict, Union
from typing import Any, Dict

from agbenchmark.utils import get_highest_success_difficulty


class ReportManager:
Expand All @@ -23,7 +25,6 @@ def load(self) -> None:
if file_content: # if file is not empty, load the json
data = json.loads(file_content)
self.tests = {k: data[k] for k in sorted(data)}
data = self.replace_backslash(data)
else: # if file is empty, assign an empty dictionary
self.tests = {}
except FileNotFoundError:
Expand All @@ -36,8 +37,9 @@ def save(self) -> None:
with open(self.filename, "w") as f:
json.dump(self.tests, f, indent=4)

def add_test(self, test_name: str, test_details: dict | list) -> None:
    """Record the result entry for *test_name* and persist the report.

    Args:
        test_name: Key under which the result is stored in ``self.tests``.
        test_details: Result payload for the test; a dict for a single run
            or a list when several entries are kept per test.

    Side effects:
        Writes the whole report to disk via ``self.save()`` after every
        insertion, so the on-disk report is always up to date.
    """
    # NOTE(review): the pasted diff showed two signatures (old `dict`, new
    # `dict | list`); only the widened, current one is kept here.
    self.tests[test_name] = test_details
    self.save()

def remove_test(self, test_name: str) -> None:
Expand All @@ -50,19 +52,12 @@ def end_info_report(self, config: Dict[str, Any]) -> None:
self.tests = {
"command": command.split(os.sep)[-1],
"completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
"time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
"metrics": {
"run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
"highest_difficulty": get_highest_success_difficulty(self.tests),
},
"tests": self.tests,
"config": config,
}

self.save()

def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
if isinstance(value, str):
return value.replace("\\\\", "/") # escape \ with \\
elif isinstance(value, list):
return [self.replace_backslash(i) for i in value]
elif isinstance(value, dict):
return {k: self.replace_backslash(v) for k, v in value.items()}
else:
return value
16 changes: 0 additions & 16 deletions agbenchmark/agent_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,10 @@ def run_agent(
"""Calling to get a response"""

if MOCK_FLAG:
print("ITS A MOCK TEST", challenge_location)
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
else:
timeout = config["cutoff"]
print(
f"Running Python function '{config['entry_path']}' with timeout {timeout}"
)
command = [sys.executable, "-m", config["entry_path"], str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=os.getcwd(),
)

start_time = time.time()

print(
f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}"
)
Expand Down
2 changes: 1 addition & 1 deletion agbenchmark/challenges/code/d1/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
4 changes: 2 additions & 2 deletions agbenchmark/challenges/code/d2/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"type": "execute_python_code"
},
"info": {
"difficulty": "medium",
"difficulty": "novice",
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
2 changes: 1 addition & 1 deletion agbenchmark/challenges/code/d3/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"type": "custom_python"
},
"info": {
"difficulty": "medium",
"difficulty": "advanced",
"description": "Tests ability for the agent to build a simple web server locally",
"side_effects": []
}
Expand Down
41 changes: 39 additions & 2 deletions agbenchmark/challenges/define_task_types.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,52 @@
import json
from enum import Enum
from pathlib import Path
from typing import List, Optional

from pydantic import BaseModel
from pydantic import BaseModel, validator


class DifficultyLevel(Enum):
    """Closed set of challenge difficulty tiers.

    Members are declared from easiest to hardest; DIFFICULTY_MAP below
    assigns each one its numeric rank in this same order.
    """

    interface = "interface"
    basic = "basic"
    novice = "novice"
    intermediate = "intermediate"
    advanced = "advanced"
    expert = "expert"
    human = "human"


# Numeric rank for each difficulty level, increasing with difficulty
# (interface=1 ... human=7). Enum iteration follows declaration order,
# so the ranks track the order the members are defined in above.
DIFFICULTY_MAP = {
    level: rank for rank, level in enumerate(DifficultyLevel, start=1)
}


class Info(BaseModel):
    """Challenge metadata block parsed from a challenge's ``data.json``.

    Attributes:
        difficulty: The challenge tier, coerced into a DifficultyLevel.
        description: Human-readable summary of what the challenge tests.
        side_effects: Free-form notes on observable side effects.
    """

    # NOTE(review): the pasted diff carried a stale duplicate annotation
    # (`difficulty: str`) above this one; only the enum-typed field remains.
    difficulty: DifficultyLevel
    description: str
    side_effects: List[str]

    @validator("difficulty", pre=True)
    def difficulty_to_enum(cls, v: str | DifficultyLevel) -> DifficultyLevel:
        """Coerce a raw value into a DifficultyLevel member.

        Accepts an existing DifficultyLevel unchanged, or a string matched
        case-insensitively against the member values.

        Raises:
            ValueError: If *v* is neither a member nor a recognised string.
        """
        if isinstance(v, DifficultyLevel):
            return v

        if isinstance(v, str):
            try:
                # Member values are lowercase strings, so fold case first.
                return DifficultyLevel(v.lower())
            except ValueError:
                pass

        raise ValueError(f"Cannot convert {v} to DifficultyLevel.")


class Ground(BaseModel):
answer: str
Expand Down
2 changes: 1 addition & 1 deletion agbenchmark/challenges/interface/read_file/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
},
"info": {
"description": "This reads the file quickly",
"difficulty": "basic",
"difficulty": "interface",
"side_effects": [""]
}
}
2 changes: 1 addition & 1 deletion agbenchmark/challenges/interface/search/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "interface",
"description": "Tests if an llm can search",
"side_effects": [""]
}
Expand Down
2 changes: 1 addition & 1 deletion agbenchmark/challenges/interface/write_file/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "interface",
"description": "Tests the writing to file",
"side_effects": ["tests if there is in fact an LLM attached"]
}
Expand Down
6 changes: 1 addition & 5 deletions agbenchmark/challenges/memory/m1/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,9 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_memory_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
4 changes: 2 additions & 2 deletions agbenchmark/challenges/memory/m2/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "novice",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
4 changes: 2 additions & 2 deletions agbenchmark/challenges/memory/m3/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"type": "file"
},
"info": {
"difficulty": "medium",
"difficulty": "intermediate",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
4 changes: 2 additions & 2 deletions agbenchmark/challenges/memory/m4/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
"type": "file"
},
"info": {
"difficulty": "medium",
"difficulty": "advanced",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
2 changes: 1 addition & 1 deletion agbenchmark/challenges/retrieval/r1/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information from a website.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
2 changes: 1 addition & 1 deletion agbenchmark/challenges/retrieval/r2/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "novice",
"description": "Tests ability to retrieve information.",
"side_effects": ["tests if there is in fact an LLM attached"]
}
Expand Down
2 changes: 1 addition & 1 deletion agbenchmark/challenges/retrieval/r3/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "intermediate",
"description": "Tests ability to retrieve information.",
"side_effects": ["tests if there is in fact an LLM attached"]
}
Expand Down
15 changes: 7 additions & 8 deletions agbenchmark/challenges/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,10 @@
from typing import Any, Dict

import pytest
from dotenv import load_dotenv

from agbenchmark.challenge import Challenge
from agbenchmark.start_benchmark import CURRENT_DIRECTORY

load_dotenv()

IMPROVE = os.getenv("IMPROVE", "False")

from agbenchmark.utils import replace_backslash

json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)

Expand All @@ -36,7 +31,11 @@ def get_test_path(json_file: str) -> str:
# Create the path from "agbenchmark" onwards
challenge_location = Path(*path.parts[agbenchmark_index:])

return str(challenge_location)
formatted_location = replace_backslash(str(challenge_location))
if isinstance(formatted_location, str):
return formatted_location
else:
return str(challenge_location)


def generate_tests() -> None:
Expand Down Expand Up @@ -68,7 +67,7 @@ def test_method(self, config: Dict[str, Any]) -> None: # type: ignore
)
sys.path.append(str(custom_python_location))

for (module_loader, name, ispkg) in pkgutil.iter_modules(
for module_loader, name, ispkg in pkgutil.iter_modules(
[str(custom_python_location)]
):
module = importlib.import_module(name)
Expand Down
Loading