Skip to content

Commit

Permalink
- merge with the main branch.
Browse files Browse the repository at this point in the history
  • Loading branch information
danyaljj committed Jan 10, 2024
2 parents a677364 + 8976a92 commit ee45525
Show file tree
Hide file tree
Showing 166 changed files with 51,690 additions and 58,003 deletions.
8 changes: 4 additions & 4 deletions sandbox tasks/sandbox_audio_quality/batch.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
audio1,audio2,audio3,audio4,audio5,audio6,slider1,Answer.slider2,Answer.slider3,Answer.slider4,Answer.slider5,Answer.slider6
215ee616/16.wav,395f0816/16.wav,62ebbd69/16.wav,62ebbd69_m/16.wav,215ee616_m/16.wav,aa73ae27_m/16.wav,,,,,,
62ebbd69_m/7.wav,215ee616/7.wav,62ebbd69/7.wav,215ee616_m/7.wav,395f0816/7.wav,aa73ae27_m/7.wav,,,,,,
395f0816/18.wav,215ee616_m/18.wav,215ee616/18.wav,62ebbd69/18.wav,aa73ae27_m/18.wav,62ebbd69_m/18.wav,,,,,,
62ebbd69_m/12.wav,215ee616/12.wav,395f0816/12.wav,62ebbd69/12.wav,aa73ae27_m/12.wav,215ee616_m/12.wav,,,,,,
215ee616/16.wav,395f0816/16.wav,62ebbd69/16.wav,62ebbd69_m/16.wav,215ee616_m/16.wav,aa73ae27_m/16.wav, ,,,,,
62ebbd69_m/7.wav,215ee616/7.wav,62ebbd69/7.wav,215ee616_m/7.wav,395f0816/7.wav,aa73ae27_m/7.wav, ,,,,,
395f0816/18.wav,215ee616_m/18.wav,215ee616/18.wav,62ebbd69/18.wav,aa73ae27_m/18.wav,62ebbd69_m/18.wav, ,,,,,
62ebbd69_m/12.wav,215ee616/12.wav,395f0816/12.wav,62ebbd69/12.wav,aa73ae27_m/12.wav,215ee616_m/12.wav, ,,,,,
2 changes: 1 addition & 1 deletion sandbox tasks/sandbox_figure_descriptions/batch.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
diagrams description0,diagrams description1,diagrams description2,"Diagram of. The image is a black and white picture of a cone with a circular hole in the middle, placed on a white background. The cone has a height of 2 units and a base of 1 unit.",https://web-instruct.s3.amazonaws.com/diagrams/tikz/324.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/429.pdf,https://web-instruct.s3.amazonaws.com/diagrams/tikz/286.pdf
diagrams description0,diagrams description1,diagrams description2,"Illustration of x, for x = 1001 and = (3,1,2,4) from def:block. The image features a complex network of red and green lines intersecting in a white background. The lines are placed at various angles, creating a visually intriguing pattern. The red and green colors seem to be organized in a specific way, possibly forming a code or a mathematical representation. ",https://web-instruct.s3.amazonaws.com/diagrams/tikz/454.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/429.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/324.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf
diagrams description0,diagrams description1,diagrams description2,"Aligning the second line of text in a node to be vertically centered, while also adjusting the size and positioning of text in the first type of node. ",https://web-instruct.s3.amazonaws.com/diagrams/tikz/324.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/429.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/135.pdf
diagrams description0,diagrams description1,diagrams description2,"Uplink SE per UE versus the number of ueK for two combining schemes with the proposed pnalmmse, single-carrier pnalmmse, and pnummse estimators for ^2_=^2_= 3.5 10^-4. ",https://web-instruct.s3.amazonaws.com/diagrams/tikz/951.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/514.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/468.pdf
diagrams description0,diagrams description1,diagrams description2,"Uplink SE per UE versus the number of ueK for two combining schemes with the proposed pnalmmse, single-carrier pnalmmse, and pnummse estimators for ^2_=^2_= 3.5 10^-4. ",https://web-instruct.s3.amazonaws.com/diagrams/tikz/951.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/514.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/689.pdf ,https://web-instruct.s3.amazonaws.com/diagrams/tikz/468.pdf
4 changes: 2 additions & 2 deletions sandbox tasks/sandbox_lamecows/batch.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Title,Description,Keywords,Task
Cows,Cows,Cows,Cows
Title,Description,Keywords,Task
Cows,Cows,Cows,Cows
4 changes: 2 additions & 2 deletions sandbox tasks/sandbox_scambaiting/batch.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Title,Description,Keywords,video_id
Title,Description,Keywords,video_id
scambaiting,scambaiting,scambaiting,QwUfYlYcCY0
scambaiting,scambaiting,scambaiting,0n0Pd2nlJHA
scambaiting,scambaiting,scambaiting,aOj3rKco8fc
Expand All @@ -9,4 +9,4 @@ scambaiting,scambaiting,scambaiting,Nc-9tLP5E1E
scambaiting,scambaiting,scambaiting,ByTV4f4gbzQ
scambaiting,scambaiting,scambaiting,cG4MRUx8p6I
scambaiting,scambaiting,scambaiting,1Tfi8bm2caE
scambaiting,scambaiting,scambaiting,y0kXxHDT1QM
scambaiting,scambaiting,scambaiting,y0kXxHDT1QM
3 changes: 2 additions & 1 deletion src/TAP_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ def test_evaluation():
results = evaluation.enumerate_tap_tasks_random(max_instance_count=2) # dictionary of results
else:
# dictionary mapping {task_name, {num_successes, num_errors, num_failing, sum_failing_scores} }
results = evaluation.enumerate_tap_tasks(max_instance_count=1000) # dictionary of results
max_instance_count = 1000
results = evaluation.enumerate_tap_tasks(max_instance_count=max_instance_count) # dictionary of results

# Global statistics
tasks_succeeded = 0
Expand Down
56 changes: 40 additions & 16 deletions src/evaluation_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ def filter_TAP_tasks(self, task_name):
if task_name in tasks_should_skip:
return False

# TODO: drop this?
bad_data = ["Elicitation Generation"]
if task_name in bad_data:
return False

if task_name not in self.task_ids.keys():
print(f"{Fore.RED}Task `{task_name}` is not available on Turkle.")
print("Available tasks are:", self.task_ids.keys())
Expand Down Expand Up @@ -181,11 +186,14 @@ def load_split_tasks(self, partitions: int):
# Greedy optimized way to split evenly
s = set() # was originally a set, but python sets aren't as robust as C++ std
sum = 0
max_instance_count = 1000
for task in all_tasks:
df = pd.read_csv(f'../tasks/{task}/batch.csv', nrows=0)
input_names = [col[len('Answer.'):] for col in df.columns if col.startswith('Answer.')]
val = min(1000, len(self.task_ids[task])) * (
8 + len(input_names)) # num_tasks * num_inputs_per_task + 8 * num_tasks
val = min(
max_instance_count,
len(self.task_ids[task])
) * (8 + len(input_names)) # num_tasks * num_inputs_per_task + 8 * num_tasks
sum += val
s.add((val, task)) # (val, task name)

Expand Down Expand Up @@ -505,6 +513,10 @@ def retrieve_gold_labels(self, task_name: str, instance_index: int, input_names:
# this will be a df with multiple rows iff there are multiple answers to the same question instance
df_subset = df[df[cols].eq(row).all(1)]


# bringing this back in to check for errors in tap test 18
assert len(df_subset) > 0, f"Could not find any answers for the instance index {instance_index}."

# create a map for each Answer (input_name) to its corresponding answers of the instance
answers_map = {
input_name: df_subset.get(f"Answer.{input_name}", np.array([])).tolist() for input_name in input_names
Expand Down Expand Up @@ -653,10 +665,11 @@ def score_outputs(self, inputs: List[Input], answers_map: Dict, task_results: Di
print(f'{Fore.RED}Skipping element `{i.name}` since it is not visible.')
continue

if i.values != i.visible_values:
raise Exception(
f"{Fore.RED}The values `{i.values}` and visible values `{i.visible_values}` should be the same for `{i}`"
)
# temp commenting out of this visible values to see what files in TAP tests 18 need to have their ending rows deleted
# if i.values != i.visible_values:
# raise Exception(
# f"{Fore.RED}The values `{i.values}` and visible values `{i.visible_values}` should be the same for `{i}`"
# )

# if the answer is already empty for text input, skip it.
# otherwise, we would be crediting the model for not filling in the input.
Expand Down Expand Up @@ -697,7 +710,6 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
Enumerate the tasks and their instances for the main evaluation loop to go through.
:param max_instance_count: maximum number of instances per task
"""

if self.tasks.startswith("dmp"):
# TODO: explain what this is
tasks = self.load_split_tasks(kwargs.get("dump_partitions"))
Expand Down Expand Up @@ -867,7 +879,7 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
per_task_score += score

if self.solver_type == 'oracle':
assert score > 0.9, f"{Fore.RED}The oracle baseline should always get a score of 1.0"
assert score > 0.99, f"{Fore.RED}The oracle baseline should always get a score of 1.0"
elif self.solver_type == 'model':
kwargs["scores"].append(score)

Expand Down Expand Up @@ -961,6 +973,7 @@ def enumerate_tap_tasks(self, max_instance_count: int):
with HiddenPrintsHiddenErrors():
for instance_id in instance_ids:
row_num = instance_id - first_instance_id
error_flag = False

url = f'{TURKLE_URL}/task/{instance_id}/iframe/'
self.driver.get(url)
Expand All @@ -970,16 +983,24 @@ def enumerate_tap_tasks(self, max_instance_count: int):
input_names = [col[len('Answer.'):] for col in df.columns if col.startswith('Answer.')]
inputs = self.extract_input_values_from_url(url=url, task_name=task_name, input_names=input_names)

answers_map = self.retrieve_gold_labels(
task_name, row_num, [x.name for x in inputs]
)
# Add stuff from kevin-2 to skip out on these answer_map
try:
answers_map = self.retrieve_gold_labels(
task_name, row_num, [x.name for x in inputs]
)
except:
error_flag = True

if error_flag:
num_errors += 1
failing_tasks.append(row_num)
continue

# Same TODO as above, file (images videos audio, css etc. are html accessible and find all URLs)

# TODO copy over dump_features
# TODO copy over report_field_stats so task_field_statistics

error_flag = False
# for each input, now go ahead and answer it with oracle
for input_idx, i in enumerate(inputs):
element = self.driver.find_element(By.NAME, i.name)
Expand Down Expand Up @@ -1014,7 +1035,7 @@ def enumerate_tap_tasks(self, max_instance_count: int):

score = self.score_outputs(inputs, answers_map, task_results=None)

if score > 0.9:
if score > 0.99:
num_successes += 1
else:
failing_tasks.append(row_num)
Expand Down Expand Up @@ -1120,9 +1141,12 @@ def enumerate_tap_tasks_random(self, max_instance_count: int):
continue

failing_tasks = failing_tasks[:10] # only keep the first 10 failing tasks
task_results[task_name] = {"num_successes": num_successes, "num_errors": num_errors,
"num_failing": len(instance_ids) - num_successes - num_errors,
"sum_failing_scores": sum_failing_scores, "failing_tasks": failing_tasks}
task_results[task_name] = {
"num_successes": num_successes,
"num_errors": num_errors,
"num_failing": len(instance_ids) - num_successes - num_errors,
"sum_failing_scores": sum_failing_scores, "failing_tasks": failing_tasks
}
print("task result", task_name, task_results[task_name])

return task_results
4 changes: 2 additions & 2 deletions src/run_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
import logging

TURKLE_URL = "http://localhost:8000"
TEST_NAME = "COMET2020 ATOMIC Inference Vp 5"
SPECIFIED_INDEX = 0
TEST_NAME = "Annotation subj_obj"
SPECIFIED_INDEX = 46
RUN_ALL = False

class Run(evaluation_class.Evaluation):
Expand Down
37 changes: 31 additions & 6 deletions src/utils/clean_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,16 +94,41 @@ def clean_empty(csv_file):
df = pd.read_csv(csv_file, low_memory=False)
for i, row in df.iterrows():
for col in df.columns:
if col.startswith("Answer."):
if not (col.startswith("Answer.") and col.endswith("subject")):
continue
if pd.isnull(row[col]) or row[col] == "":
df.loc[i, col] = "Empty"
df.loc[i, col] = "--NO SUBJECT--"
df.to_csv(csv_file , encoding='utf-8-sig', index=False)

# Make certain columns empty if not specific values
def make_empty(csv_file):
    """Blank out disallowed values in the ``Answer.*box`` columns.

    Every column whose name starts with ``Answer.`` and ends with ``box`` is
    scanned; cells that are already empty/NaN or hold the literal string
    ``"no"`` are kept, and any other value is replaced with an empty string.
    The CSV is rewritten in place with utf-8-sig encoding.

    :param csv_file: path to the batch CSV to clean in place
    """
    df = pd.read_csv(csv_file, low_memory=False)
    # Only the "...box" answer columns are subject to this rule.
    box_columns = [c for c in df.columns if c.startswith("Answer.") and c.endswith("box")]
    for column in box_columns:
        for idx, value in df[column].items():
            keep = pd.isnull(value) or value == "" or value == "no"
            if not keep:
                df.loc[idx, column] = ""
    df.to_csv(csv_file , encoding='utf-8-sig', index=False)

# Clean an "Empty" cell with certain properties (in certain col) into an actually empty cell, reversing clean_empty
def clean_unempty(csv_file):
    """Reverse of ``clean_empty``: restore placeholder cells to empty.

    Any cell in an ``Answer.*`` column holding the literal string ``"Empty"``
    is replaced with an empty string; all other cells (including NaN) are
    left alone. The CSV is rewritten in place with utf-8-sig encoding.

    :param csv_file: path to the batch CSV to clean in place
    """
    df = pd.read_csv(csv_file, low_memory=False)
    answer_columns = [c for c in df.columns if c.startswith("Answer.")]
    for column in answer_columns:
        # NaN == "Empty" is False, so missing cells are skipped automatically.
        placeholder_mask = df[column] == "Empty"
        df.loc[placeholder_mask, column] = ""
    df.to_csv(csv_file , encoding='utf-8-sig', index=False)

# Script entry point: walk the `tasks` tree and clean the batch CSVs of the
# tasks listed in `files_to_edit`.
# NOTE(review): this span is a diff rendered without +/- markers, so the old
# and new versions of several lines appear back to back below; also the
# scrape has stripped Python's leading indentation. Do not run as-is —
# reconcile against the actual post-commit file first.
if __name__ == '__main__':
# NOTE(review): old value of files_to_edit (pre-commit line).
files_to_edit = ["wikiHow Goal Membership"]
# NOTE(review): new value of files_to_edit (post-commit line).
files_to_edit = ["Annotation subj_obj"]
for root, dirs, files in os.walk('tasks'):
for file in files:
# NOTE(review): pre-commit branch — called clean_split_up_radio on batch CSVs
# of the selected tasks.
if file.endswith('.csv') and root.split("/")[1] in files_to_edit and file.startswith('batch'):
print('Cleaning ' + file)
clean_split_up_radio(os.path.join(root, file))
# NOTE(review): post-commit branch — same file selection split into two
# conditions, now calling make_empty instead.
if file.endswith('.csv') and file.startswith('batch'):
if root.split("/")[1] in files_to_edit:
print('Cleaning ' + os.path.join(root, file))
make_empty(os.path.join(root, file))
Loading

0 comments on commit ee45525

Please sign in to comment.