Skip to content

Commit

Permalink
- drop the timeout.
Browse files Browse the repository at this point in the history
  • Loading branch information
danyaljj committed Jan 11, 2024
1 parent adc2881 commit 26e19af
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 15 deletions.
39 changes: 28 additions & 11 deletions src/evaluation_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def load_task_names(self):
return subjective_test
elif self.tasks == 'train':
                # all tasks minus test and subjective test
return list(set(all_tasks) - set(test) - set(subjective_test))
return list(set(all_tasks) - set(test) - set(subjective_test) - set(test_hard))
else:
raise Exception(f"{Fore.RED}Invalid setup: {self.tasks}")

Expand Down Expand Up @@ -534,16 +534,8 @@ def calculate_metrics(self, answers: List[str], input_type: str, baseline_answer
print(f"{Fore.YELLOW}----> answers: `{answers}` - type: `{type(answers)}`")
print(f"{Fore.YELLOW}----> baseline_answer: `{baseline_answer}` - type: `{type(baseline_answer)}`")

# normalize responses: turn "nan", or "{}" into empty string
for idx in range(len(answers)):
a = answers[idx]
if a in ["nan", "{}", "'{}'"] or (type(a) == float and np.isnan(a)):
answers[idx] = ""

logging.info(f"answers after mapping: `{answers}`")

# handle empty
if answers == []:
if answers == [] or answers == [""]:
if baseline_answer == "" or baseline_answer == [""] or \
baseline_answer == [] or baseline_answer == "[]" or baseline_answer == "['']":
score = 1.0
Expand Down Expand Up @@ -689,6 +681,26 @@ def score_outputs(self, inputs: List[Input], answers_map: Dict, task_results: Di
else:
i.values = ''

# if the input type is textbox and the gold text is empty, skip it.
# otherwise, we would be crediting the model for not filling many inputs that are not required.
if i.type in ['text', 'textarea', 'hidden']:

answers = answers_map[i.name]
# normalize responses: turn "nan", or "{}" into empty string
for idx in range(len(answers)):
a = answers[idx]
if a in ["nan", "{}", "'{}'"] or (type(a) == float and np.isnan(a)):
answers[idx] = ""

print(f"answers after mapping: `{answers}`")

answers = clean_values(answers)
answers = list(set(answers))

if answers == [] or answers == [""]:
continue


# the score for this specific model input/output
score_per_field = self.calculate_metrics(answers_map[i.name], i.type, i.values)

Expand Down Expand Up @@ -727,9 +739,14 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
for task_name in tqdm(tasks):
print(f"{Fore.BLUE} = = = = = = = = = = = = starting new task: `{task_name}` = = = = = = = = = = = = ")

if self.filter_TAP_tasks(task_name) == False:
# skip, if starting with .
if task_name.startswith("."):
continue

# commenting this out since these tasks are not part of the evaluation
# if self.filter_TAP_tasks(task_name) == False:
# continue

instance_ids = self.task_ids[task_name]
first_instance_id = min(instance_ids)
print("First instance id:", first_instance_id)
Expand Down
8 changes: 4 additions & 4 deletions tasks/Summarization (RLUE) 1/template.html
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ <h5><span class="key-term2">System&#39;s summary <u>(rate this!)</u>:</span></h5
<br />
Please take time to read the system&#39;s summary and to skim the article briefly --- then, rate the system&#39;s summary on the form below (appears in ~15s...). <i> For &quot;Summary quality&quot; --- please reference the article to check: 1) if the summary contains the key points; and 2) if the specific details mentioned in the summary are correct. </i>

<div class="row align-content-center mt-5" id="coherence_slider_for_visible" style="visibility:hidden;">
<div class="row align-content-center mt-5" id="coherence_slider_for_visible" >
<div class="col-12 align-content-center">
<div class="form-group"><label for="coherence" id="coherencelabel" style="font-size: 32px"><strong>Coherence/Fluency: 3/5</strong></label><br />
<small>Is the system&#39;s generation <u>grammatical, easy-to-read, and well-written?</u> </small><br />
Expand All @@ -135,7 +135,7 @@ <h5><span class="key-term2">System&#39;s summary <u>(rate this!)</u>:</span></h5
</div>
</div>

<div class="row align-content-center" id="quality_slider_for_visible" style="visibility:hidden;">
<div class="row align-content-center" id="quality_slider_for_visible">
<div class="col-12 align-content-center">
<div class="form-group"><label for="quality" id="qualitylabel" style="font-size: 32px"><strong>Summary Quality: 3/5</strong></label><br />
<small><em>Does the system&#39;s generation meaningfully capture the main points in the article? </em> </small><br />
Expand All @@ -145,14 +145,14 @@ <h5><span class="key-term2">System&#39;s summary <u>(rate this!)</u>:</span></h5
</div>
<!-- OPTIONAL FEEDBACK -->

<div class="row mt-5" id="feedback_for_visible" style="visibility:hidden;">
<div class="row mt-5" id="feedback_for_visible">
<div class="col-8 offset-2 col-lg-6 offset-lg-3">
<p>(Optional) Please let us know if anything was unclear, if you experienced any issues, or if you have any other feedback for us.</p>
<textarea id="feedback" name="feedback" rows="3"></textarea></div>
</div>
<!-- SUBMIT BUTTON -->

<div class="row mt-5" id="submit_for_visible" style="visibility:hidden;">
<div class="row mt-5" id="submit_for_visible">
<div class="col-2 offset-5"><input id="submitButton" onclick="getnext()" type="submit" value="Submit" /></div>
</div>
</form>
Expand Down

0 comments on commit 26e19af

Please sign in to comment.