Skip to content
This repository has been archived by the owner on Jun 9, 2024. It is now read-only.

fixing backslashes, adding basic metrics #89

Merged
merged 13 commits into from
Jul 12, 2023
23 changes: 9 additions & 14 deletions agbenchmark/ReportManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import sys
import time
from datetime import datetime
from typing import Any, Dict, Union
from typing import Any, Dict

from agbenchmark.utils import get_highest_success_difficulty


class ReportManager:
Expand All @@ -23,7 +25,6 @@ def load(self) -> None:
if file_content: # if file is not empty, load the json
data = json.loads(file_content)
self.tests = {k: data[k] for k in sorted(data)}
data = self.replace_backslash(data)
else: # if file is empty, assign an empty dictionary
self.tests = {}
except FileNotFoundError:
Expand All @@ -36,8 +37,9 @@ def save(self) -> None:
with open(self.filename, "w") as f:
json.dump(self.tests, f, indent=4)

def add_test(self, test_name: str, test_details: dict | list) -> None:
    """Record the result entry for *test_name* and persist the report.

    Args:
        test_name: Key under which the result is stored in ``self.tests``.
        test_details: Result payload for the test; a dict for a single run
            or a list when several entries are kept per test.

    Side effects:
        Writes the whole report to disk via ``self.save()`` after every
        insertion, so the on-disk report is always up to date.
    """
    # NOTE(review): the pasted diff showed two signatures (old `dict`, new
    # `dict | list`); only the widened, current one is kept here.
    self.tests[test_name] = test_details
    self.save()

def remove_test(self, test_name: str) -> None:
Expand All @@ -50,19 +52,12 @@ def end_info_report(self, config: Dict[str, Any]) -> None:
self.tests = {
"command": command.split(os.sep)[-1],
"completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
"time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
"metrics": {
"run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
"highest_difficulty": get_highest_success_difficulty(self.tests),
},
"tests": self.tests,
"config": config,
}

self.save()

def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
if isinstance(value, str):
return value.replace("\\\\", "/") # escape \ with \\
elif isinstance(value, list):
return [self.replace_backslash(i) for i in value]
elif isinstance(value, dict):
return {k: self.replace_backslash(v) for k, v in value.items()}
else:
return value
16 changes: 0 additions & 16 deletions agbenchmark/agent_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,10 @@ def run_agent(
"""Calling to get a response"""

if MOCK_FLAG:
print("ITS A MOCK TEST", challenge_location)
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
else:
timeout = config["cutoff"]
print(
f"Running Python function '{config['entry_path']}' with timeout {timeout}"
)
command = [sys.executable, "-m", config["entry_path"], str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=os.getcwd(),
)

start_time = time.time()

print(
f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}"
)
Expand Down
2 changes: 1 addition & 1 deletion agbenchmark/challenges/code/d1/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
4 changes: 2 additions & 2 deletions agbenchmark/challenges/code/d2/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"type": "execute_python_code"
},
"info": {
"difficulty": "medium",
"difficulty": "novice",
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
2 changes: 1 addition & 1 deletion agbenchmark/challenges/code/d3/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"type": "custom_python"
},
"info": {
"difficulty": "medium",
"difficulty": "advanced",
"description": "Tests ability for the agent to build a simple web server locally",
"side_effects": []
}
Expand Down
41 changes: 39 additions & 2 deletions agbenchmark/challenges/define_task_types.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,52 @@
import json
from enum import Enum
from pathlib import Path
from typing import List, Optional

from pydantic import BaseModel
from pydantic import BaseModel, validator


class DifficultyLevel(Enum):
    """Closed set of challenge difficulty tiers.

    Members are declared from easiest to hardest; DIFFICULTY_MAP below
    assigns each one its numeric rank in this same order.
    """

    interface = "interface"
    basic = "basic"
    novice = "novice"
    intermediate = "intermediate"
    advanced = "advanced"
    expert = "expert"
    human = "human"


# Numeric rank for each difficulty level, increasing with difficulty
# (interface=1 ... human=7). Enum iteration follows declaration order,
# so the ranks track the order the members are defined in above.
DIFFICULTY_MAP = {
    level: rank for rank, level in enumerate(DifficultyLevel, start=1)
}


class Info(BaseModel):
    """Challenge metadata block parsed from a challenge's ``data.json``.

    Attributes:
        difficulty: The challenge tier, coerced into a DifficultyLevel.
        description: Human-readable summary of what the challenge tests.
        side_effects: Free-form notes on observable side effects.
    """

    # NOTE(review): the pasted diff carried a stale duplicate annotation
    # (`difficulty: str`) above this one; only the enum-typed field remains.
    difficulty: DifficultyLevel
    description: str
    side_effects: List[str]

    @validator("difficulty", pre=True)
    def difficulty_to_enum(cls, v: str | DifficultyLevel) -> DifficultyLevel:
        """Coerce a raw value into a DifficultyLevel member.

        Accepts an existing DifficultyLevel unchanged, or a string matched
        case-insensitively against the member values.

        Raises:
            ValueError: If *v* is neither a member nor a recognised string.
        """
        if isinstance(v, DifficultyLevel):
            return v

        if isinstance(v, str):
            try:
                # Member values are lowercase strings, so fold case first.
                return DifficultyLevel(v.lower())
            except ValueError:
                pass

        raise ValueError(f"Cannot convert {v} to DifficultyLevel.")


class Ground(BaseModel):
answer: str
Expand Down
2 changes: 1 addition & 1 deletion agbenchmark/challenges/interface/read_file/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
},
"info": {
"description": "This reads the file quickly",
"difficulty": "basic",
"difficulty": "interface",
"side_effects": [""]
}
}
2 changes: 1 addition & 1 deletion agbenchmark/challenges/interface/search/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "interface",
"description": "Tests if an llm can search",
"side_effects": [""]
}
Expand Down
2 changes: 1 addition & 1 deletion agbenchmark/challenges/interface/write_file/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "interface",
"description": "Tests the writing to file",
"side_effects": ["tests if there is in fact an LLM attached"]
}
Expand Down
6 changes: 1 addition & 5 deletions agbenchmark/challenges/memory/m1/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,9 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_memory_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
4 changes: 2 additions & 2 deletions agbenchmark/challenges/memory/m2/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "novice",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
4 changes: 2 additions & 2 deletions agbenchmark/challenges/memory/m3/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"type": "file"
},
"info": {
"difficulty": "medium",
"difficulty": "intermediate",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
4 changes: 2 additions & 2 deletions agbenchmark/challenges/memory/m4/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
"type": "file"
},
"info": {
"difficulty": "medium",
"difficulty": "advanced",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
2 changes: 1 addition & 1 deletion agbenchmark/challenges/retrieval/r1/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information from a website.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}
2 changes: 1 addition & 1 deletion agbenchmark/challenges/retrieval/r2/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "novice",
"description": "Tests ability to retrieve information.",
"side_effects": ["tests if there is in fact an LLM attached"]
}
Expand Down
2 changes: 1 addition & 1 deletion agbenchmark/challenges/retrieval/r3/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "intermediate",
"description": "Tests ability to retrieve information.",
"side_effects": ["tests if there is in fact an LLM attached"]
}
Expand Down
15 changes: 7 additions & 8 deletions agbenchmark/challenges/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,10 @@
from typing import Any, Dict

import pytest
from dotenv import load_dotenv

from agbenchmark.challenge import Challenge
from agbenchmark.start_benchmark import CURRENT_DIRECTORY

load_dotenv()

IMPROVE = os.getenv("IMPROVE", "False")

from agbenchmark.utils import replace_backslash

json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)

Expand All @@ -36,7 +31,11 @@ def get_test_path(json_file: str) -> str:
# Create the path from "agbenchmark" onwards
challenge_location = Path(*path.parts[agbenchmark_index:])

return str(challenge_location)
formatted_location = replace_backslash(str(challenge_location))
if isinstance(formatted_location, str):
return formatted_location
else:
return str(challenge_location)


def generate_tests() -> None:
Expand Down Expand Up @@ -68,7 +67,7 @@ def test_method(self, config: Dict[str, Any]) -> None: # type: ignore
)
sys.path.append(str(custom_python_location))

for (module_loader, name, ispkg) in pkgutil.iter_modules(
for module_loader, name, ispkg in pkgutil.iter_modules(
[str(custom_python_location)]
):
module = importlib.import_module(name)
Expand Down
Loading