Skip to content

Commit

Permalink
- drop the timeout.
Browse files Browse the repository at this point in the history
  • Loading branch information
danyaljj committed Jan 11, 2024
1 parent adc2881 commit 26e19af
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 15 deletions.
39 changes: 28 additions & 11 deletions src/evaluation_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def load_task_names(self):
return subjective_test
elif self.tasks == 'train':
                # all tasks minus test and subjective test
return list(set(all_tasks) - set(test) - set(subjective_test))
return list(set(all_tasks) - set(test) - set(subjective_test) - set(test_hard))
else:
raise Exception(f"{Fore.RED}Invalid setup: {self.tasks}")

Expand Down Expand Up @@ -534,16 +534,8 @@ def calculate_metrics(self, answers: List[str], input_type: str, baseline_answer
print(f"{Fore.YELLOW}----> answers: `{answers}` - type: `{type(answers)}`")
print(f"{Fore.YELLOW}----> baseline_answer: `{baseline_answer}` - type: `{type(baseline_answer)}`")

# normalize responses: turn "nan", or "{}" into empty string
for idx in range(len(answers)):
a = answers[idx]
if a in ["nan", "{}", "'{}'"] or (type(a) == float and np.isnan(a)):
answers[idx] = ""

logging.info(f"answers after mapping: `{answers}`")

# handle empty
if answers == []:
if answers == [] or answers == [""]:
if baseline_answer == "" or baseline_answer == [""] or \
baseline_answer == [] or baseline_answer == "[]" or baseline_answer == "['']":
score = 1.0
Expand Down Expand Up @@ -689,6 +681,26 @@ def score_outputs(self, inputs: List[Input], answers_map: Dict, task_results: Di
else:
i.values = ''

# if the input type is textbox and the gold text is empty, skip it.
# otherwise, we would be crediting the model for not filling many inputs that are not required.
if i.type in ['text', 'textarea', 'hidden']:

answers = answers_map[i.name]
# normalize responses: turn "nan", or "{}" into empty string
for idx in range(len(answers)):
a = answers[idx]
if a in ["nan", "{}", "'{}'"] or (type(a) == float and np.isnan(a)):
answers[idx] = ""

print(f"answers after mapping: `{answers}`")

answers = clean_values(answers)
answers = list(set(answers))

if answers == [] or answers == [""]:
continue


# the score for this specific model input/output
score_per_field = self.calculate_metrics(answers_map[i.name], i.type, i.values)

Expand Down Expand Up @@ -727,9 +739,14 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
for task_name in tqdm(tasks):
print(f"{Fore.BLUE} = = = = = = = = = = = = starting new task: `{task_name}` = = = = = = = = = = = = ")

if self.filter_TAP_tasks(task_name) == False:
# skip, if starting with .
if task_name.startswith("."):
continue

# commenting this out since these tasks are not part of the evaluation
# if self.filter_TAP_tasks(task_name) == False:
# continue

instance_ids = self.task_ids[task_name]
first_instance_id = min(instance_ids)
print("First instance id:", first_instance_id)
Expand Down
8 changes: 4 additions & 4 deletions tasks/Summarization (RLUE) 1/template.html
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ <h5><span class="key-term2">System&#39;s summary <u>(rate this!)</u>:</span></h5
<br />
Please take time to read the system&#39;s summary and to skim the article briefly --- then, rate the system&#39;s summary on the form below (appears in ~15s...). <i> For &quot;Summary quality&quot; --- please reference the article to check: 1) if the summary contains the key points; and 2) if the specific details mentioned in the summary are correct. </i>

<div class="row align-content-center mt-5" id="coherence_slider_for_visible" style="visibility:hidden;">
<div class="row align-content-center mt-5" id="coherence_slider_for_visible" >
<div class="col-12 align-content-center">
<div class="form-group"><label for="coherence" id="coherencelabel" style="font-size: 32px"><strong>Coherence/Fluency: 3/5</strong></label><br />
<small>Is the system&#39;s generation <u>grammatical, easy-to-read, and well-written?</u> </small><br />
Expand All @@ -135,7 +135,7 @@ <h5><span class="key-term2">System&#39;s summary <u>(rate this!)</u>:</span></h5
</div>
</div>

<div class="row align-content-center" id="quality_slider_for_visible" style="visibility:hidden;">
<div class="row align-content-center" id="quality_slider_for_visible">
<div class="col-12 align-content-center">
<div class="form-group"><label for="quality" id="qualitylabel" style="font-size: 32px"><strong>Summary Quality: 3/5</strong></label><br />
<small><em>Does the system&#39;s generation meaningfully capture the main points in the article? </em> </small><br />
Expand All @@ -145,14 +145,14 @@ <h5><span class="key-term2">System&#39;s summary <u>(rate this!)</u>:</span></h5
</div>
<!-- OPTIONAL FEEDBACK -->

<div class="row mt-5" id="feedback_for_visible" style="visibility:hidden;">
<div class="row mt-5" id="feedback_for_visible">
<div class="col-8 offset-2 col-lg-6 offset-lg-3">
<p>(Optional) Please let us know if anything was unclear, if you experienced any issues, or if you have any other feedback for us.</p>
<textarea id="feedback" name="feedback" rows="3"></textarea></div>
</div>
<!-- SUBMIT BUTTON -->

<div class="row mt-5" id="submit_for_visible" style="visibility:hidden;">
<div class="row mt-5" id="submit_for_visible">
<div class="col-2 offset-5"><input id="submitButton" onclick="getnext()" type="submit" value="Submit" /></div>
</div>
</form>
Expand Down

0 comments on commit 26e19af

Please sign in to comment.