diff --git a/src/evaluation_class.py b/src/evaluation_class.py
index 42e7d929..d0da176d 100644
--- a/src/evaluation_class.py
+++ b/src/evaluation_class.py
@@ -284,7 +284,7 @@ def load_task_names(self):
             return subjective_test
         elif self.tasks == 'train':
             # all tasks minue test and subjective test
-            return list(set(all_tasks) - set(test) - set(subjective_test))
+            return list(set(all_tasks) - set(test) - set(subjective_test) - set(test_hard))
         else:
             raise Exception(f"{Fore.RED}Invalid setup: {self.tasks}")
 
@@ -534,16 +534,8 @@ def calculate_metrics(self, answers: List[str], input_type: str, baseline_answer
         print(f"{Fore.YELLOW}----> answers: `{answers}` - type: `{type(answers)}`")
         print(f"{Fore.YELLOW}----> baseline_answer: `{baseline_answer}` - type: `{type(baseline_answer)}`")
 
-        # normalize responses: turn "nan", or "{}" into empty string
-        for idx in range(len(answers)):
-            a = answers[idx]
-            if a in ["nan", "{}", "'{}'"] or (type(a) == float and np.isnan(a)):
-                answers[idx] = ""
-
-        logging.info(f"answers after mapping: `{answers}`")
-
         # handle empty
-        if answers == []:
+        if answers == [] or answers == [""]:
             if baseline_answer == "" or baseline_answer == [""] or \
                     baseline_answer == [] or baseline_answer == "[]" or baseline_answer == "['']":
                 score = 1.0
@@ -689,6 +681,26 @@ def score_outputs(self, inputs: List[Input], answers_map: Dict, task_results: Di
             else:
                 i.values = ''
 
+            # if the input type is textbox and the gold text is empty, skip it.
+            # otherwise, we would be crediting the model for not filling many inputs that are not required.
+            if i.type in ['text', 'textarea', 'hidden']:
+
+                answers = answers_map[i.name]
+                # normalize responses: turn "nan", or "{}" into empty string
+                for idx in range(len(answers)):
+                    a = answers[idx]
+                    if a in ["nan", "{}", "'{}'"] or (type(a) == float and np.isnan(a)):
+                        answers[idx] = ""
+
+                print(f"answers after mapping: `{answers}`")
+
+                answers = clean_values(answers)
+                answers = list(set(answers))
+
+                if answers == [] or answers == [""]:
+                    continue
+
+            # the score for this specific model input/output
             score_per_field = self.calculate_metrics(answers_map[i.name], i.type, i.values)
@@ -727,9 +739,14 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
         for task_name in tqdm(tasks):
             print(f"{Fore.BLUE} = = = = = = = = = = = = starting new task: `{task_name}` = = = = = = = = = = = = ")
 
-            if self.filter_TAP_tasks(task_name) == False:
+            # skip, if starting with .
+            if task_name.startswith("."):
                 continue
 
+            # commenting this out since these tasks are not part of the evaluation
+            # if self.filter_TAP_tasks(task_name) == False:
+            #     continue
+
             instance_ids = self.task_ids[task_name]
             first_instance_id = min(instance_ids)
             print("First instance id:", first_instance_id)
diff --git a/tasks/Summarization (RLUE) 1/template.html b/tasks/Summarization (RLUE) 1/template.html
index ca3af7cf..c50b109e 100644
--- a/tasks/Summarization (RLUE) 1/template.html
+++ b/tasks/Summarization (RLUE) 1/template.html
@@ -126,7 +126,7 @@
 System's summary (rate this!):
 Please take time to read the system's summary and to skim the article briefly --- then, rate the system's summary on the form below (appears in ~15s...). For "Summary quality" --- please reference the article to check: 1) if the summary contains the key points; and 2) if the specific details mentioned in the summary are correct.
-
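
Note on the new skip logic added to score_outputs: read in isolation, the behavior of that hunk amounts to the sketch below. This is a minimal, hedged illustration rather than the repository's exact code: normalize_answers and should_skip_text_field are hypothetical helper names introduced only for this sketch, and clean_values here is a simplified stand-in for the project's own helper of the same name; only the ['text', 'textarea', 'hidden'] check, the "nan"/"{}" normalization, and the empty-gold comparison come directly from the diff.

from typing import List

import numpy as np


def normalize_answers(answers: List[str]) -> List[str]:
    # Mirror the normalization loop added in score_outputs:
    # map placeholder values ("nan", "{}", "'{}'", NaN floats) to empty strings.
    normalized = []
    for a in answers:
        if a in ["nan", "{}", "'{}'"] or (isinstance(a, float) and np.isnan(a)):
            normalized.append("")
        else:
            normalized.append(a)
    return normalized


def clean_values(answers: List[str]) -> List[str]:
    # Simplified stand-in for the repository's clean_values helper;
    # here it only strips surrounding whitespace from string entries.
    return [a.strip() if isinstance(a, str) else a for a in answers]


def should_skip_text_field(field_type: str, gold_answers: List[str]) -> bool:
    # A text-like field is skipped when, after normalization and deduplication,
    # its gold answers are all empty, so an empty model output is not rewarded
    # for inputs that were never required to be filled.
    if field_type not in ["text", "textarea", "hidden"]:
        return False
    answers = list(set(clean_values(normalize_answers(gold_answers))))
    return answers == [] or answers == [""]


if __name__ == "__main__":
    print(should_skip_text_field("textarea", ["nan", "{}", ""]))  # True: no gold text
    print(should_skip_text_field("text", ["a concise summary"]))  # False: still scored
    print(should_skip_text_field("radio", ["nan"]))               # False: not a text field

The practical effect is that optional free-text fields whose gold text is empty no longer reach calculate_metrics, which (per the second hunk) returns a score of 1.0 whenever both the answers and the baseline answer are empty, and would therefore have credited a model for leaving such fields blank.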