diff --git a/src/4_run_evaluation.py b/src/4_run_evaluation.py
index 6d71704e..0b812e66 100644
--- a/src/4_run_evaluation.py
+++ b/src/4_run_evaluation.py
@@ -77,8 +77,8 @@
use_relevant_html=args.use_relevant_html
)
- eval.enumerate_tasks(max_instance_count)
+ # eval.enumerate_tasks(max_instance_count)
# Debugging mode
- # eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True)
+ eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True)
# Collecting example code: python 4_run_evaluation.py --no-do_eval --headless > extract.txt
# eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True, input_name="norm")
diff --git a/src/evaluation_class.py b/src/evaluation_class.py
index 2edc6164..a85f96e6 100644
--- a/src/evaluation_class.py
+++ b/src/evaluation_class.py
@@ -943,53 +943,53 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
elif self.solver_type == 'model':
kwargs["scores"].append(score)
- if self.do_eval:
- # per-task statistics
- per_task_score = per_task_score / len(instance_ids)
- print(f"{Fore.MAGENTA}Task: {task_name} --> Score: {per_task_score}")
- df = pd.DataFrame()
- for task_name, inputs in results.items():
- all_scores = []
- for input_type, scores in inputs.items():
- avg_score = sum(scores) / len(scores)
- all_scores.extend(scores)
- df = pd.concat(
- [
- df, pd.DataFrame({
- 'project': [task_name],
- 'input_type': [input_type],
- 'score': [avg_score]
- })
- ],
- ignore_index=True)
-
-
- # add the overall score across all the inputs
- df = pd.concat([
- df, pd.DataFrame({
+ if self.do_eval:
+ # per-task statistics
+ per_task_score = per_task_score / len(instance_ids)
+ print(f"{Fore.MAGENTA}Task: {task_name} --> Score: {per_task_score}")
+ df = pd.DataFrame()
+ for task_name, inputs in results.items():
+ all_scores = []
+ for input_type, scores in inputs.items():
+ avg_score = sum(scores) / len(scores)
+ all_scores.extend(scores)
+ df = pd.concat(
+ [
+ df, pd.DataFrame({
'project': [task_name],
- 'input_type': ["all"],
- 'score': [sum(all_scores) / len(all_scores)]
- }
- )], ignore_index=True
- )
-
- if 'project' not in df.columns:
- df.insert(0, 'project', '')
- if 'input_type' not in df.columns:
- df.insert(1, 'input_type', '')
- if 'score' not in df.columns:
- df.insert(1, 'score', '')
-
- df = df.pivot(index='project', columns='input_type', values='score')
- today = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
- if self.solver_type == "text-vision" or self.solver_type == "gpt4-text-vision":
- csv_filename = f'{self.solver_type}_{self.num_demonstrations}_use-relevant-html_{self.use_relevant_html}_{self.tasks}_scores_{today}.csv'
- df.to_csv(csv_filename, index=True)
-
- # save results to json
- with open(f'{self.solver_type}_scores_{today}.json', 'w') as f:
- json.dump(results, f, indent=4)
+ 'input_type': [input_type],
+ 'score': [avg_score]
+ })
+ ],
+ ignore_index=True)
+
+
+ # add the overall score across all the inputs
+ df = pd.concat([
+ df, pd.DataFrame({
+ 'project': [task_name],
+ 'input_type': ["all"],
+ 'score': [sum(all_scores) / len(all_scores)]
+ }
+ )], ignore_index=True
+ )
+
+ if 'project' not in df.columns:
+ df.insert(0, 'project', '')
+ if 'input_type' not in df.columns:
+ df.insert(1, 'input_type', '')
+ if 'score' not in df.columns:
+ df.insert(1, 'score', '')
+
+ df = df.pivot(index='project', columns='input_type', values='score')
+ today = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+ if self.solver_type == "text-vision" or self.solver_type == "gpt4-text-vision":
+ csv_filename = f'{self.solver_type}_{self.num_demonstrations}_use-relevant-html_{self.use_relevant_html}_{self.tasks}_scores_{today}.csv'
+ df.to_csv(csv_filename, index=True)
+
+ # save results to json
+ with open(f'{self.solver_type}_scores_{today}.json', 'w') as f:
+ json.dump(results, f, indent=4)
if self.dump_features:
with open(f'{directory}/{task_name}.json', 'w') as f:
diff --git a/src/text_gpt.txt b/src/text_gpt.txt
deleted file mode 100644
index 6dc4b5af..00000000
--- a/src/text_gpt.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Solver: gpt4-text-vision
-tasks ['ethics_sbic dialogue 2nd 0']
- = = = = = = = = = = = = starting new task: `ethics_sbic dialogue 2nd 0` = = = = = = = = = = = =
-First instance id: 16430
-instance_id: 16430 <-> row_number: 0
- --> inputs: ['fix1', 'fix2', 'fix3', 'fix4', 'fix5', 'norm', 'socialnew', 'response', 'norm_fix', 'socialfix', 'certify-no-pii-response']
- --> Looking up gold labels from row index 0 of `input.csv` (unique inputs).
- --> input labels: {'fix1': ['{}'], 'fix2': ["I'm shocked you would do this. Why would you do this?"], 'fix3': ['{}'], 'fix4': ['{}'], 'fix5': ['{}'], 'norm': ['s1'], 'socialnew': ['{}'], 'response': ["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."], 'norm_fix': [''], 'socialfix': ['{}'], 'certify-no-pii-response': ['on']}
- - - - - - - starting a new element: `Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix1
-Executing one action: self.actions.modify_text('fix1', 'I accidentally spilled coffee on a customer.')
-Add text `I accidentally spilled coffee on a customer.` to this text input:
- - - - - - - starting a new element: `Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix2
-Executing one action: self.actions.modify_text('fix2', 'I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?')
-Add text `I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input:
- - - - - - - starting a new element: `Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix3
-Executing one action: self.actions.modify_text('fix3', "I accidentally spilled coffee on a customer.")
-Add text `I accidentally spilled coffee on a customer.` to this text input:
- - - - - - - starting a new element: `Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix4
-Executing one action: self.actions.modify_text('fix4', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?')
-Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input:
- - - - - - - starting a new element: `Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix5
-Executing one action: self.actions.modify_text('fix5', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?')
-Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input:
- - - - - - - starting a new element: `Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: norm
-Executing one action: self.actions.modify_radio('norm', 's6')
-We are going to select this radio button:
- - - - - - - starting a new element: `Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: socialnew
-Executing one action: self.actions.modify_text('socialnew', 'It’s important to handle conflicts with customers professionally and calmly.')
-Add text `It’s important to handle conflicts with customers professionally and calmly.` to this text input:
- - - - - - - starting a new element: `Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: response
-Executing one action: self.actions.modify_text('response', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?')
-Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input:
- - - - - - - starting a new element: `Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: norm_fix
-Executing one action: self.actions.modify_checkbox('norm_fix', 'on')
-Looking for checkboxes with `name`: `norm_fix` the values: `['on']`
- ** Warning **: input value is []. So, we're not going to modify the checkbox.
- - - - - - - starting a new element: `Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: socialfix
-Executing one action: self.actions.modify_text('socialfix', 'It’s important to handle conflicts with customers professionally and calmly.')
-Add text `It’s important to handle conflicts with customers professionally and calmly.` to this text input:
- - - - - - - starting a new element: `Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -
-input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: certify-no-pii-response
-Executing one action: self.actions.modify_checkbox('certify-no-pii-response', True)
-Looking for checkboxes with `name`: `certify-no-pii-response` the values: `['True']`
- ** Warning **: input value is []. So, we're not going to modify the checkbox.
-input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I accidentally spilled coffee on a customer.']
- visible_values : ['I accidentally spilled coffee on a customer.']
-input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
- visible_values : ['I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
-input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I accidentally spilled coffee on a customer.']
- visible_values : ['I accidentally spilled coffee on a customer.']
-input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
- visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
-input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
- visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
-input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['s6']
- visible_values : ['s6']
-input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['It’s important to handle conflicts with customers professionally and calmly.']
- visible_values : ['It’s important to handle conflicts with customers professionally and calmly.']
-input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
- visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
-input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)
- visible : []
- visible_values : []
-input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['It’s important to handle conflicts with customers professionally and calmly.']
- visible_values : ['It’s important to handle conflicts with customers professionally and calmly.']
-input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['on']
- visible_values : ['on']
- ------- evaluating input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -------
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -------
-answers after mapping: `["I'm shocked you would do this. Why would you do this?"]`
-----> answers: `["I'm shocked you would do this. Why would you do this?"]` - type: ``
-----> baseline_answer: `I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` - type: ``
- --> scores: 0.06666666666666667
-----> per-field score: 0.06666666666666667
- ------- evaluating input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -------
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -------
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -------
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`) -------
-----> answers: `['s1']` - type: ``
-----> baseline_answer: `s6` - type: ``
---> Computing the majority vote
- --> scores: 0.0
-----> per-field score: 0.0
- ------- evaluating input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -------
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -------
-answers after mapping: `["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."]`
-----> answers: `["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."]` - type: ``
-----> baseline_answer: `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` - type: ``
- --> scores: 0.20408163265306123
-----> per-field score: 0.20408163265306123
- ------- evaluating input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) -------
-----> answers: `['']` - type: ``
-----> baseline_answer: `` - type: ``
-----> per-field score: 1.0
- ------- evaluating input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -------
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) -------
-----> answers: `['on']` - type: ``
-----> baseline_answer: `on` - type: ``
-Model answers: on
-Gold answers: ['on']
- --> scores: 1.0
-----> per-field score: 1.0
- --> Per-instance overall score: 0.45414965986394557
- --> Per-instance per-field breakdown: {'textarea': [0.06666666666666667, 0.20408163265306123], 'radio': [0.0], 'checkbox': [1.0, 1.0]}
-Task: ethics_sbic dialogue 2nd 0 --> Score: 0.45414965986394557