evalscript.py
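"""Grade previously generated model answers with an evaluating model.

Summary of the script below: for every question in questions/ (.txt or .png), it looks
for the answering model's answer file in answers/, asks the evaluating model to grade
that answer from 1.0 to 10.0, and stores the response in an evaluation folder. The loop
keeps polling (every 15 seconds) until nothing is left to grade and/or no related files
were modified within the last hour, depending on Shared.MASS_EVAL.
"""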
import os
import traceback
import time
import datetime

from common import (ANSWERING_MODEL_NAME, EVALUATING_MODEL_NAME, query_text_simple,
                    query_image_simple, callback_write, encode_image, set_api_key,
                    is_visual_model)


class Shared:
    # Global flag read by perform_evaluation(); the __main__ block below turns it off.
    # See the break condition at the bottom of the polling loop for its effect.
    MASS_EVAL = True


def files_modified_last_hour(folder_path, m_name):
    # Get the current time
    now = datetime.datetime.now()
    # Calculate the time one hour ago
    one_hour_ago = now - datetime.timedelta(hours=1)
    # List to store files modified in the last hour
    modified_files = []
    # Iterate through all files in the folder
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.startswith(m_name):
            # Check if it's a file (not a directory)
            if os.path.isfile(file_path):
                # Get the last modified time of the file
                file_mod_time = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
                # Check if the file was modified within the last hour
                if file_mod_time > one_hour_ago:
                    modified_files.append(filename)
    return modified_files
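
# Usage sketch (hypothetical file names; the real prefixes come from the model names in common):
#   files_modified_last_hour("answers", "gpt-4o")
#   -> e.g. ["gpt-4o_question1.txt"] if that answer file was written within the last hour.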


def perform_evaluation(answering_model_name=None):
    if answering_model_name is None:
        answering_model_name = ANSWERING_MODEL_NAME
    questions = [x for x in os.listdir("questions") if x.endswith(".txt") or x.endswith(".png")]
    INCLUDE_EVALUATING_MNAME_IN_EVALUATION = False
    while True:
        missing = False
        for q in questions:
            # Remove path separators and colons so the model names can be used in file names.
            m_name = answering_model_name.replace("/", "").replace(":", "")
            e_m_name = EVALUATING_MODEL_NAME.replace("/", "").replace(":", "")
            question_path = os.path.join("questions", q)
            answer_path = m_name + "_" + q
            answer_path = os.path.join("answers", answer_path)
            answer_path = answer_path.replace(".png", ".txt")
            if INCLUDE_EVALUATING_MNAME_IN_EVALUATION:
                evaluation_path = m_name + "__" + e_m_name + "__" + q
            else:
                evaluation_path = m_name + "_" + q
            # gpt-4o evaluations go into the default "evaluation" folder; any other
            # evaluating model gets its own "evaluation-<model>" folder.
            if "gpt-4o" in e_m_name:
                base_evaluation_path = "evaluation"
            else:
                base_evaluation_path = "evaluation-" + e_m_name
            if not os.path.exists(base_evaluation_path):
                os.mkdir(base_evaluation_path)
            evaluation_path = os.path.join(base_evaluation_path, evaluation_path)
            evaluation_path = evaluation_path.replace(".png", ".txt")
            if os.path.exists(answer_path) and not os.path.exists(evaluation_path):
                print("Evaluating:", answer_path)
                answer = open(answer_path, "r").read()
                if answer is not None and answer:
                    # At least one answer still needed grading on this pass.
                    missing = True
                    try:
                        if question_path.endswith(".txt"):
                            question = open(question_path, "r", encoding="utf-8").read()
                            inquiry = ["Given the following question:\n\n"]
                            inquiry.append(question)
                            inquiry.append(
                                "\n\nHow would you grade the following answer from 1.0 (minimum) to 10.0 (maximum)? Please put the grade at the beginning of the response.\n\n")
                            inquiry.append(answer)
                            # Join without a separator; the parts already carry their own "\n\n" breaks.
                            inquiry = "".join(inquiry)
                            query_text_simple(None, evaluation_path, callback_write, question=inquiry)
                        elif is_visual_model(EVALUATING_MODEL_NAME):
                            base64_image = encode_image(question_path)
                            inquiry = [
                                "Given the attached image, how would you grade the following answer from 1.0 (minimum) to 10.0 (maximum)?\n\n"]
                            inquiry.append(answer)
                            inquiry = "".join(inquiry)
                            query_image_simple(None, evaluation_path, callback_write, base64_image=base64_image,
                                               text=inquiry)
                    except Exception:
                        traceback.print_exc()
        # Decide whether another polling pass is needed. In mass-eval mode the loop also
        # stops once nothing related to this model changed within the last hour.
        last_hour_answers = files_modified_last_hour("answers", m_name)
        last_hour_evaluations = files_modified_last_hour(base_evaluation_path, m_name)
        if Shared.MASS_EVAL:
            break_condition = (not missing) or (not last_hour_answers and not last_hour_evaluations)
        else:
            break_condition = (not missing) and (not last_hour_answers and not last_hour_evaluations)
        if break_condition:
            break
        time.sleep(15)


# Module-level: select the API key used for evaluation requests (helper from common).
set_api_key("evaluation")

if __name__ == "__main__":
    # When run directly, disable mass-eval mode so the loop keeps polling until no
    # evaluations are missing and no related files have changed within the last hour.
    Shared.MASS_EVAL = False
    perform_evaluation()
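
# Directory layout assumed by the path handling above (inferred from the code, not from
# project documentation):
#   questions/<q>                  question files, either .txt or .png
#   answers/<m_name>_<q>.txt       answers produced by the answering model
#   evaluation/  or  evaluation-<e_m_name>/
#       <m_name>_<q>.txt           grade files written by this script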