diff --git a/src/4_run_evaluation.py b/src/4_run_evaluation.py index 6d71704e..0b812e66 100644 --- a/src/4_run_evaluation.py +++ b/src/4_run_evaluation.py @@ -77,8 +77,8 @@ use_relevant_html=args.use_relevant_html ) - eval.enumerate_tasks(max_instance_count) + # eval.enumerate_tasks(max_instance_count) # Debugging mode - # eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True) + eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True) # Collecting example code: python 4_run_evaluation.py --no-do_eval --headless > extract.txt # eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True, input_name="norm") diff --git a/src/evaluation_class.py b/src/evaluation_class.py index 2edc6164..a85f96e6 100644 --- a/src/evaluation_class.py +++ b/src/evaluation_class.py @@ -943,53 +943,53 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs): elif self.solver_type == 'model': kwargs["scores"].append(score) - if self.do_eval: - # per-task statistics - per_task_score = per_task_score / len(instance_ids) - print(f"{Fore.MAGENTA}Task: {task_name} --> Score: {per_task_score}") - df = pd.DataFrame() - for task_name, inputs in results.items(): - all_scores = [] - for input_type, scores in inputs.items(): - avg_score = sum(scores) / len(scores) - all_scores.extend(scores) - df = pd.concat( - [ - df, pd.DataFrame({ - 'project': [task_name], - 'input_type': [input_type], - 'score': [avg_score] - }) - ], - ignore_index=True) - - - # add the overall score across all the inputs - df = pd.concat([ - df, pd.DataFrame({ + if self.do_eval: + # per-task statistics + per_task_score = per_task_score / len(instance_ids) + print(f"{Fore.MAGENTA}Task: {task_name} --> Score: {per_task_score}") + df = pd.DataFrame() + for task_name, inputs in results.items(): + all_scores = [] + for input_type, scores in inputs.items(): + avg_score = sum(scores) / len(scores) + all_scores.extend(scores) + df = pd.concat( + [ + df, pd.DataFrame({ 'project': [task_name], - 'input_type': ["all"], - 'score': [sum(all_scores) / len(all_scores)] - } - )], ignore_index=True - ) - - if 'project' not in df.columns: - df.insert(0, 'project', '') - if 'input_type' not in df.columns: - df.insert(1, 'input_type', '') - if 'score' not in df.columns: - df.insert(1, 'score', '') - - df = df.pivot(index='project', columns='input_type', values='score') - today = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - if self.solver_type == "text-vision" or self.solver_type == "gpt4-text-vision": - csv_filename = f'{self.solver_type}_{self.num_demonstrations}_use-relevant-html_{self.use_relevant_html}_{self.tasks}_scores_{today}.csv' - df.to_csv(csv_filename, index=True) - - # save results to json - with open(f'{self.solver_type}_scores_{today}.json', 'w') as f: - json.dump(results, f, indent=4) + 'input_type': [input_type], + 'score': [avg_score] + }) + ], + ignore_index=True) + + + # add the overall score across all the inputs + df = pd.concat([ + df, pd.DataFrame({ + 'project': [task_name], + 'input_type': ["all"], + 'score': [sum(all_scores) / len(all_scores)] + } + )], ignore_index=True + ) + + if 'project' not in df.columns: + df.insert(0, 'project', '') + if 'input_type' not in df.columns: + df.insert(1, 'input_type', '') + if 'score' not in df.columns: + df.insert(1, 'score', '') + + df = df.pivot(index='project', columns='input_type', values='score') + today = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + if self.solver_type == "text-vision" or self.solver_type == "gpt4-text-vision": + csv_filename = f'{self.solver_type}_{self.num_demonstrations}_use-relevant-html_{self.use_relevant_html}_{self.tasks}_scores_{today}.csv' + df.to_csv(csv_filename, index=True) + + # save results to json + with open(f'{self.solver_type}_scores_{today}.json', 'w') as f: + json.dump(results, f, indent=4) if self.dump_features: with open(f'{directory}/{task_name}.json', 'w') as f: diff --git a/src/text_gpt.txt b/src/text_gpt.txt deleted file mode 100644 index 6dc4b5af..00000000 --- a/src/text_gpt.txt +++ /dev/null @@ -1,142 +0,0 @@ -Solver: gpt4-text-vision -tasks ['ethics_sbic dialogue 2nd 0'] - = = = = = = = = = = = = starting new task: `ethics_sbic dialogue 2nd 0` = = = = = = = = = = = = -First instance id: 16430 -instance_id: 16430 <-> row_number: 0 - --> inputs: ['fix1', 'fix2', 'fix3', 'fix4', 'fix5', 'norm', 'socialnew', 'response', 'norm_fix', 'socialfix', 'certify-no-pii-response'] - --> Looking up gold labels from row index 0 of `input.csv` (unique inputs). - --> input labels: {'fix1': ['{}'], 'fix2': ["I'm shocked you would do this. Why would you do this?"], 'fix3': ['{}'], 'fix4': ['{}'], 'fix5': ['{}'], 'norm': ['s1'], 'socialnew': ['{}'], 'response': ["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."], 'norm_fix': [''], 'socialfix': ['{}'], 'certify-no-pii-response': ['on']} - - - - - - - starting a new element: `Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: fix1 -Executing one action: self.actions.modify_text('fix1', 'I accidentally spilled coffee on a customer.') -Add text `I accidentally spilled coffee on a customer.` to this text input: - - - - - - - starting a new element: `Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: fix2 -Executing one action: self.actions.modify_text('fix2', 'I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?') -Add text `I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input: - - - - - - - starting a new element: `Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: fix3 -Executing one action: self.actions.modify_text('fix3', "I accidentally spilled coffee on a customer.") -Add text `I accidentally spilled coffee on a customer.` to this text input: - - - - - - - starting a new element: `Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: fix4 -Executing one action: self.actions.modify_text('fix4', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?') -Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input: - - - - - - - starting a new element: `Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: fix5 -Executing one action: self.actions.modify_text('fix5', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?') -Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input: - - - - - - - starting a new element: `Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: norm -Executing one action: self.actions.modify_radio('norm', 's6') -We are going to select this radio button: - - - - - - - starting a new element: `Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: socialnew -Executing one action: self.actions.modify_text('socialnew', 'It’s important to handle conflicts with customers professionally and calmly.') -Add text `It’s important to handle conflicts with customers professionally and calmly.` to this text input: - - - - - - - starting a new element: `Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: response -Executing one action: self.actions.modify_text('response', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?') -Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input: - - - - - - - starting a new element: `Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: norm_fix -Executing one action: self.actions.modify_checkbox('norm_fix', 'on') -Looking for checkboxes with `name`: `norm_fix` the values: `['on']` - ** Warning **: input value is []. So, we're not going to modify the checkbox. - - - - - - - starting a new element: `Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: socialfix -Executing one action: self.actions.modify_text('socialfix', 'It’s important to handle conflicts with customers professionally and calmly.') -Add text `It’s important to handle conflicts with customers professionally and calmly.` to this text input: - - - - - - - starting a new element: `Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - - -input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) -about to try executing one action, on the following input: certify-no-pii-response -Executing one action: self.actions.modify_checkbox('certify-no-pii-response', True) -Looking for checkboxes with `name`: `certify-no-pii-response` the values: `['True']` - ** Warning **: input value is []. So, we're not going to modify the checkbox. -input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['I accidentally spilled coffee on a customer.'] - visible_values : ['I accidentally spilled coffee on a customer.'] -input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?'] - visible_values : ['I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?'] -input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['I accidentally spilled coffee on a customer.'] - visible_values : ['I accidentally spilled coffee on a customer.'] -input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?'] - visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?'] -input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?'] - visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?'] -input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['s6'] - visible_values : ['s6'] -input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['It’s important to handle conflicts with customers professionally and calmly.'] - visible_values : ['It’s important to handle conflicts with customers professionally and calmly.'] -input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?'] - visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?'] -input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) - visible : [] - visible_values : [] -input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['It’s important to handle conflicts with customers professionally and calmly.'] - visible_values : ['It’s important to handle conflicts with customers professionally and calmly.'] -input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) - visible : ['on'] - visible_values : ['on'] - ------- evaluating input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- -answers after mapping: `['']` - ------- evaluating input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- -answers after mapping: `["I'm shocked you would do this. Why would you do this?"]` -----> answers: `["I'm shocked you would do this. Why would you do this?"]` - type: `` -----> baseline_answer: `I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` - type: `` - --> scores: 0.06666666666666667 -----> per-field score: 0.06666666666666667 - ------- evaluating input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- -answers after mapping: `['']` - ------- evaluating input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- -answers after mapping: `['']` - ------- evaluating input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- -answers after mapping: `['']` - ------- evaluating input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`) ------- -----> answers: `['s1']` - type: `` -----> baseline_answer: `s6` - type: `` ---> Computing the majority vote - --> scores: 0.0 -----> per-field score: 0.0 - ------- evaluating input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- -answers after mapping: `['']` - ------- evaluating input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- -answers after mapping: `["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."]` -----> answers: `["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."]` - type: `` -----> baseline_answer: `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` - type: `` - --> scores: 0.20408163265306123 -----> per-field score: 0.20408163265306123 - ------- evaluating input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) ------- -----> answers: `['']` - type: `` -----> baseline_answer: `` - type: `` -----> per-field score: 1.0 - ------- evaluating input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- -answers after mapping: `['']` - ------- evaluating input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) ------- -----> answers: `['on']` - type: `` -----> baseline_answer: `on` - type: `` -Model answers: on -Gold answers: ['on'] - --> scores: 1.0 -----> per-field score: 1.0 - --> Per-instance overall score: 0.45414965986394557 - --> Per-instance per-field breakdown: {'textarea': [0.06666666666666667, 0.20408163265306123], 'radio': [0.0], 'checkbox': [1.0, 1.0]} -Task: ethics_sbic dialogue 2nd 0 --> Score: 0.45414965986394557