Skip to content

Commit

Permalink
- merge with the main branch.
Browse files Browse the repository at this point in the history
  • Loading branch information
danyaljj committed Jan 10, 2024
2 parents a677364 + 8976a92 commit ee45525
Show file tree
Hide file tree
Showing 166 changed files with 51,690 additions and 58,003 deletions.
8 changes: 4 additions & 4 deletions sandbox tasks/sandbox_audio_quality/batch.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
audio1,audio2,audio3,audio4,audio5,audio6,slider1,Answer.slider2,Answer.slider3,Answer.slider4,Answer.slider5,Answer.slider6
215ee616/16.wav,395f0816/16.wav,62ebbd69/16.wav,62ebbd69_m/16.wav,215ee616_m/16.wav,aa73ae27_m/16.wav,,,,,,
62ebbd69_m/7.wav,215ee616/7.wav,62ebbd69/7.wav,215ee616_m/7.wav,395f0816/7.wav,aa73ae27_m/7.wav,,,,,,
395f0816/18.wav,215ee616_m/18.wav,215ee616/18.wav,62ebbd69/18.wav,aa73ae27_m/18.wav,62ebbd69_m/18.wav,,,,,,
62ebbd69_m/12.wav,215ee616/12.wav,395f0816/12.wav,62ebbd69/12.wav,aa73ae27_m/12.wav,215ee616_m/12.wav,,,,,,
215ee616/16.wav,395f0816/16.wav,62ebbd69/16.wav,62ebbd69_m/16.wav,215ee616_m/16.wav,aa73ae27_m/16.wav, ,,,,,
62ebbd69_m/7.wav,215ee616/7.wav,62ebbd69/7.wav,215ee616_m/7.wav,395f0816/7.wav,aa73ae27_m/7.wav, ,,,,,
395f0816/18.wav,215ee616_m/18.wav,215ee616/18.wav,62ebbd69/18.wav,aa73ae27_m/18.wav,62ebbd69_m/18.wav, ,,,,,
62ebbd69_m/12.wav,215ee616/12.wav,395f0816/12.wav,62ebbd69/12.wav,aa73ae27_m/12.wav,215ee616_m/12.wav, ,,,,,
2 changes: 1 addition & 1 deletion sandbox tasks/sandbox_figure_descriptions/batch.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
diagrams description0,diagrams description1,diagrams description2,"Diagram of. The image is a black and white picture of a cone with a circular hole in the middle, placed on a white background. The cone has a height of 2 units and a base of 1 unit.",https://web-instruct.s3.amazonaws.com/diagrams/tikz/324.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/429.pdf,https://web-instruct.s3.amazonaws.com/diagrams/tikz/286.pdf
diagrams description0,diagrams description1,diagrams description2,"Illustration of x, for x = 1001 and = (3,1,2,4) from def:block. The image features a complex network of red and green lines intersecting in a white background. The lines are placed at various angles, creating a visually intriguing pattern. The red and green colors seem to be organized in a specific way, possibly forming a code or a mathematical representation. ",https://web-instruct.s3.amazonaws.com/diagrams/tikz/454.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/429.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/324.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf
diagrams description0,diagrams description1,diagrams description2,"Aligning the second line of text in a node to be vertically centered, while also adjusting the size and positioning of text in the first type of node. ",https://web-instruct.s3.amazonaws.com/diagrams/tikz/324.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/429.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/135.pdf
diagrams description0,diagrams description1,diagrams description2,"Uplink SE per UE versus the number of ueK for two combining schemes with the proposed pnalmmse, single-carrier pnalmmse, and pnummse estimators for ^2_=^2_= 3.5 10^-4. ",https://web-instruct.s3.amazonaws.com/diagrams/tikz/951.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/514.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/468.pdf
diagrams description0,diagrams description1,diagrams description2,"Uplink SE per UE versus the number of ueK for two combining schemes with the proposed pnalmmse, single-carrier pnalmmse, and pnummse estimators for ^2_=^2_= 3.5 10^-4. ",https://web-instruct.s3.amazonaws.com/diagrams/tikz/951.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/514.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/468.pdf
4 changes: 2 additions & 2 deletions sandbox tasks/sandbox_lamecows/batch.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Title,Description,Keywords,Task
Cows,Cows,Cows,Cows
Title,Description,Keywords,Task
Cows,Cows,Cows,Cows
4 changes: 2 additions & 2 deletions sandbox tasks/sandbox_scambaiting/batch.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Title,Description,Keywords,video_id
Title,Description,Keywords,video_id
scambaiting,scambaiting,scambaiting,QwUfYlYcCY0
scambaiting,scambaiting,scambaiting,0n0Pd2nlJHA
scambaiting,scambaiting,scambaiting,aOj3rKco8fc
Expand All @@ -9,4 +9,4 @@ scambaiting,scambaiting,scambaiting,Nc-9tLP5E1E
scambaiting,scambaiting,scambaiting,ByTV4f4gbzQ
scambaiting,scambaiting,scambaiting,cG4MRUx8p6I
scambaiting,scambaiting,scambaiting,1Tfi8bm2caE
scambaiting,scambaiting,scambaiting,y0kXxHDT1QM
scambaiting,scambaiting,scambaiting,y0kXxHDT1QM
3 changes: 2 additions & 1 deletion src/TAP_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ def test_evaluation():
results = evaluation.enumerate_tap_tasks_random(max_instance_count=2) # dictionary of results
else:
# dictionary mapping {task_name, {num_successes, num_errors, num_failing, sum_failing_scores} }
results = evaluation.enumerate_tap_tasks(max_instance_count=1000) # dictionary of results
max_instance_count = 1000
results = evaluation.enumerate_tap_tasks(max_instance_count=max_instance_count) # dictionary of results

# Global statistics
tasks_succeeded = 0
Expand Down
56 changes: 40 additions & 16 deletions src/evaluation_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ def filter_TAP_tasks(self, task_name):
if task_name in tasks_should_skip:
return False

# TODO: drop this?
bad_data = ["Elicitation Generation"]
if task_name in bad_data:
return False

if task_name not in self.task_ids.keys():
print(f"{Fore.RED}Task `{task_name}` is not available on Turkle.")
print("Available tasks are:", self.task_ids.keys())
Expand Down Expand Up @@ -181,11 +186,14 @@ def load_split_tasks(self, partitions: int):
# Greedy optimized way to split evenly
s = set() # was originally a set, but python sets aren't as robust as C++ std
sum = 0
max_instance_count = 1000
for task in all_tasks:
df = pd.read_csv(f'../tasks/{task}/batch.csv', nrows=0)
input_names = [col[len('Answer.'):] for col in df.columns if col.startswith('Answer.')]
val = min(1000, len(self.task_ids[task])) * (
8 + len(input_names)) # num_tasks * num_inputs_per_task + 8 * num_tasks
val = min(
max_instance_count,
len(self.task_ids[task])
) * (8 + len(input_names)) # num_tasks * num_inputs_per_task + 8 * num_tasks
sum += val
s.add((val, task)) # (val, task name)

Expand Down Expand Up @@ -505,6 +513,10 @@ def retrieve_gold_labels(self, task_name: str, instance_index: int, input_names:
# this will be a df with multiple rows iff there are multiple answers to the same question instance
df_subset = df[df[cols].eq(row).all(1)]


# bringing this back in to check for errors in tap test 18
assert len(df_subset) > 0, f"Could not find any answers for the instance index {instance_index}."

# create a map for each Answer (input_name) to its corresponding answers of the instance
answers_map = {
input_name: df_subset.get(f"Answer.{input_name}", np.array([])).tolist() for input_name in input_names
Expand Down Expand Up @@ -653,10 +665,11 @@ def score_outputs(self, inputs: List[Input], answers_map: Dict, task_results: Di
print(f'{Fore.RED}Skipping element `{i.name}` since it is not visible.')
continue

if i.values != i.visible_values:
raise Exception(
f"{Fore.RED}The values `{i.values}` and visible values `{i.visible_values}` should be the same for `{i}`"
)
# temp commenting out of this visible values to see what files in TAP tests 18 need to have their ending rows deleted
# if i.values != i.visible_values:
# raise Exception(
# f"{Fore.RED}The values `{i.values}` and visible values `{i.visible_values}` should be the same for `{i}`"
# )

# if the answer is already empty for text input, skip it.
# otherwise, we would be crediting the model for not filling in the input.
Expand Down Expand Up @@ -697,7 +710,6 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
Enumerate the tasks and their instances for the main evaluation loop to go through.
:param max_instance_count: maximum number of instances per task
"""

if self.tasks.startswith("dmp"):
# TODO: explain what this is
tasks = self.load_split_tasks(kwargs.get("dump_partitions"))
Expand Down Expand Up @@ -867,7 +879,7 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
per_task_score += score

if self.solver_type == 'oracle':
assert score > 0.9, f"{Fore.RED}The oracle baseline should always get a score of 1.0"
assert score > 0.99, f"{Fore.RED}The oracle baseline should always get a score of 1.0"
elif self.solver_type == 'model':
kwargs["scores"].append(score)

Expand Down Expand Up @@ -961,6 +973,7 @@ def enumerate_tap_tasks(self, max_instance_count: int):
with HiddenPrintsHiddenErrors():
for instance_id in instance_ids:
row_num = instance_id - first_instance_id
error_flag = False

url = f'{TURKLE_URL}/task/{instance_id}/iframe/'
self.driver.get(url)
Expand All @@ -970,16 +983,24 @@ def enumerate_tap_tasks(self, max_instance_count: int):
input_names = [col[len('Answer.'):] for col in df.columns if col.startswith('Answer.')]
inputs = self.extract_input_values_from_url(url=url, task_name=task_name, input_names=input_names)

answers_map = self.retrieve_gold_labels(
task_name, row_num, [x.name for x in inputs]
)
# Add stuff from kevin-2 to skip out on these answer_map
try:
answers_map = self.retrieve_gold_labels(
task_name, row_num, [x.name for x in inputs]
)
except:
error_flag = True

if error_flag:
num_errors += 1
failing_tasks.append(row_num)
continue

# Same TODO as above, file (images videos audio, css etc. are html accessible and find all URLs)

# TODO copy over dump_features
# TODO copy over report_field_stats so task_field_statistics

error_flag = False
# for each input, now go ahead and answer it with oracle
for input_idx, i in enumerate(inputs):
element = self.driver.find_element(By.NAME, i.name)
Expand Down Expand Up @@ -1014,7 +1035,7 @@ def enumerate_tap_tasks(self, max_instance_count: int):

score = self.score_outputs(inputs, answers_map, task_results=None)

if score > 0.9:
if score > 0.99:
num_successes += 1
else:
failing_tasks.append(row_num)
Expand Down Expand Up @@ -1120,9 +1141,12 @@ def enumerate_tap_tasks_random(self, max_instance_count: int):
continue

failing_tasks = failing_tasks[:10] # only keep the first 10 failing tasks
task_results[task_name] = {"num_successes": num_successes, "num_errors": num_errors,
"num_failing": len(instance_ids) - num_successes - num_errors,
"sum_failing_scores": sum_failing_scores, "failing_tasks": failing_tasks}
task_results[task_name] = {
"num_successes": num_successes,
"num_errors": num_errors,
"num_failing": len(instance_ids) - num_successes - num_errors,
"sum_failing_scores": sum_failing_scores, "failing_tasks": failing_tasks
}
print("task result", task_name, task_results[task_name])

return task_results
4 changes: 2 additions & 2 deletions src/run_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
import logging

TURKLE_URL = "http://localhost:8000"
TEST_NAME = "COMET2020 ATOMIC Inference Vp 5"
SPECIFIED_INDEX = 0
TEST_NAME = "Annotation subj_obj"
SPECIFIED_INDEX = 46
RUN_ALL = False

class Run(evaluation_class.Evaluation):
Expand Down
37 changes: 31 additions & 6 deletions src/utils/clean_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,16 +94,41 @@ def clean_empty(csv_file):
df = pd.read_csv(csv_file, low_memory=False)
for i, row in df.iterrows():
for col in df.columns:
if col.startswith("Answer."):
if not (col.startswith("Answer.") and col.endswith("subject")):
continue
if pd.isnull(row[col]) or row[col] == "":
df.loc[i, col] = "Empty"
df.loc[i, col] = "--NO SUBJECT--"
df.to_csv(csv_file , encoding='utf-8-sig', index=False)

# Make certain columns empty if not specific values
def make_empty(csv_file):
    """Blank out disallowed values in the ``Answer.*box`` columns.

    Every column whose name starts with ``Answer.`` and ends with ``box`` is
    scanned; cells that are already empty/NaN or hold the literal string
    ``"no"`` are kept, and any other value is replaced with an empty string.
    The CSV is rewritten in place with utf-8-sig encoding.

    :param csv_file: path to the batch CSV to clean in place
    """
    df = pd.read_csv(csv_file, low_memory=False)
    # Only the "...box" answer columns are subject to this rule.
    box_columns = [c for c in df.columns if c.startswith("Answer.") and c.endswith("box")]
    for column in box_columns:
        for idx, value in df[column].items():
            keep = pd.isnull(value) or value == "" or value == "no"
            if not keep:
                df.loc[idx, column] = ""
    df.to_csv(csv_file , encoding='utf-8-sig', index=False)

# Clean an "Empty" cell with certain properties (in certain col) into an actually empty cell, reversing clean_empty
def clean_unempty(csv_file):
    """Reverse of ``clean_empty``: restore placeholder cells to empty.

    Any cell in an ``Answer.*`` column holding the literal string ``"Empty"``
    is replaced with an empty string; all other cells (including NaN) are
    left alone. The CSV is rewritten in place with utf-8-sig encoding.

    :param csv_file: path to the batch CSV to clean in place
    """
    df = pd.read_csv(csv_file, low_memory=False)
    answer_columns = [c for c in df.columns if c.startswith("Answer.")]
    for column in answer_columns:
        # NaN == "Empty" is False, so missing cells are skipped automatically.
        placeholder_mask = df[column] == "Empty"
        df.loc[placeholder_mask, column] = ""
    df.to_csv(csv_file , encoding='utf-8-sig', index=False)

# Script entry point: walk the `tasks` tree and clean the batch CSVs of the
# tasks listed in `files_to_edit`.
# NOTE(review): this span is a diff rendered without +/- markers, so the old
# and new versions of several lines appear back to back below; also the
# scrape has stripped Python's leading indentation. Do not run as-is —
# reconcile against the actual post-commit file first.
if __name__ == '__main__':
# NOTE(review): old value of files_to_edit (pre-commit line).
files_to_edit = ["wikiHow Goal Membership"]
# NOTE(review): new value of files_to_edit (post-commit line).
files_to_edit = ["Annotation subj_obj"]
for root, dirs, files in os.walk('tasks'):
for file in files:
# NOTE(review): pre-commit branch — called clean_split_up_radio on batch CSVs
# of the selected tasks.
if file.endswith('.csv') and root.split("/")[1] in files_to_edit and file.startswith('batch'):
print('Cleaning ' + file)
clean_split_up_radio(os.path.join(root, file))
# NOTE(review): post-commit branch — same file selection split into two
# conditions, now calling make_empty instead.
if file.endswith('.csv') and file.startswith('batch'):
if root.split("/")[1] in files_to_edit:
print('Cleaning ' + os.path.join(root, file))
make_empty(os.path.join(root, file))
Loading

0 comments on commit ee45525

Please sign in to comment.