diff --git a/src/4_run_evaluation.py b/src/4_run_evaluation.py
index 6d71704e..0b812e66 100644
--- a/src/4_run_evaluation.py
+++ b/src/4_run_evaluation.py
@@ -77,8 +77,8 @@
         use_relevant_html=args.use_relevant_html
     )
 
-    eval.enumerate_tasks(max_instance_count)
+    # eval.enumerate_tasks(max_instance_count)
     # Debugging mode
-    # eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True)
+    eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True)
     # Collecting example code: python 4_run_evaluation.py --no-do_eval --headless > extract.txt
     # eval.enumerate_tasks(max_instance_count, task="ethics_sbic dialogue 2nd 0", first_instance_only=True, input_name="norm")
diff --git a/src/evaluation_class.py b/src/evaluation_class.py
index 2edc6164..a85f96e6 100644
--- a/src/evaluation_class.py
+++ b/src/evaluation_class.py
@@ -943,53 +943,53 @@ def enumerate_tasks(self, max_instance_count: int, **kwargs):
                     elif self.solver_type == 'model':
                         kwargs["scores"].append(score)
 
-            if self.do_eval:
-                # per-task statistics
-                per_task_score = per_task_score / len(instance_ids)
-                print(f"{Fore.MAGENTA}Task: {task_name} --> Score: {per_task_score}")
-                df = pd.DataFrame()
-                for task_name, inputs in results.items():
-                    all_scores = []
-                    for input_type, scores in inputs.items():
-                        avg_score = sum(scores) / len(scores)
-                        all_scores.extend(scores)
-                        df = pd.concat(
-                            [
-                                df, pd.DataFrame({
-                                'project': [task_name],
-                                'input_type': [input_type],
-                                'score': [avg_score]
-                            })
-                            ],
-                            ignore_index=True)
-
-
-                    # add the overall score across all the inputs
-                    df = pd.concat([
-                        df, pd.DataFrame({
+        if self.do_eval:
+            # per-task statistics
+            per_task_score = per_task_score / len(instance_ids)
+            print(f"{Fore.MAGENTA}Task: {task_name} --> Score: {per_task_score}")
+            df = pd.DataFrame()
+            for task_name, inputs in results.items():
+                all_scores = []
+                for input_type, scores in inputs.items():
+                    avg_score = sum(scores) / len(scores)
+                    all_scores.extend(scores)
+                    df = pd.concat(
+                        [
+                            df, pd.DataFrame({
                             'project': [task_name],
-                            'input_type': ["all"],
-                            'score': [sum(all_scores) / len(all_scores)]
-                        }
-                        )], ignore_index=True
-                    )
-
-                if 'project' not in df.columns:
-                    df.insert(0, 'project', '')
-                if 'input_type' not in df.columns:
-                    df.insert(1, 'input_type', '')
-                if 'score' not in df.columns:
-                    df.insert(1, 'score', '')
-
-                df = df.pivot(index='project', columns='input_type', values='score')
-                today = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
-                if self.solver_type == "text-vision" or self.solver_type == "gpt4-text-vision":
-                    csv_filename = f'{self.solver_type}_{self.num_demonstrations}_use-relevant-html_{self.use_relevant_html}_{self.tasks}_scores_{today}.csv'
-                df.to_csv(csv_filename, index=True)
-
-                # save results to json
-                with open(f'{self.solver_type}_scores_{today}.json', 'w') as f:
-                    json.dump(results, f, indent=4)
+                            'input_type': [input_type],
+                            'score': [avg_score]
+                        })
+                        ],
+                        ignore_index=True)
+
+
+                # add the overall score across all the inputs
+                df = pd.concat([
+                    df, pd.DataFrame({
+                        'project': [task_name],
+                        'input_type': ["all"],
+                        'score': [sum(all_scores) / len(all_scores)]
+                    }
+                    )], ignore_index=True
+                )
+
+            if 'project' not in df.columns:
+                df.insert(0, 'project', '')
+            if 'input_type' not in df.columns:
+                df.insert(1, 'input_type', '')
+            if 'score' not in df.columns:
+                df.insert(1, 'score', '')
+
+            df = df.pivot(index='project', columns='input_type', values='score')
+            today = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+            if self.solver_type == "text-vision" or self.solver_type == "gpt4-text-vision":
+                csv_filename = f'{self.solver_type}_{self.num_demonstrations}_use-relevant-html_{self.use_relevant_html}_{self.tasks}_scores_{today}.csv'
+            df.to_csv(csv_filename, index=True)
+
+            # save results to json
+            with open(f'{self.solver_type}_scores_{today}.json', 'w') as f:
+                json.dump(results, f, indent=4)
 
         if self.dump_features:
             with open(f'{directory}/{task_name}.json', 'w') as f:
diff --git a/src/text_gpt.txt b/src/text_gpt.txt
deleted file mode 100644
index 6dc4b5af..00000000
--- a/src/text_gpt.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Solver: gpt4-text-vision
-tasks ['ethics_sbic dialogue 2nd 0']
- = = = = = = = = = = = = starting new task: `ethics_sbic dialogue 2nd 0` = = = = = = = = = = = = 
-First instance id: 16430
-instance_id: 16430 <-> row_number: 0
- --> inputs: ['fix1', 'fix2', 'fix3', 'fix4', 'fix5', 'norm', 'socialnew', 'response', 'norm_fix', 'socialfix', 'certify-no-pii-response']
- --> Looking up gold labels from row index 0 of `input.csv` (unique inputs). 
- --> input labels: {'fix1': ['{}'], 'fix2': ["I'm shocked you would do this. Why would you do this?"], 'fix3': ['{}'], 'fix4': ['{}'], 'fix5': ['{}'], 'norm': ['s1'], 'socialnew': ['{}'], 'response': ["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."], 'norm_fix': [''], 'socialfix': ['{}'], 'certify-no-pii-response': ['on']}
- - - - - - -  starting a new element: `Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix1
-Executing one action: self.actions.modify_text('fix1', 'I accidentally spilled coffee on a customer.')
-Add text `I accidentally spilled coffee on a customer.` to this text input: <textarea class="form-control" id="fix1" name="fix1" oninput="changeDialog(this);" placeholder="Modify: I poured the hot coffee on the customer" rows="2" style="width: 100%;"></textarea>
- - - - - - -  starting a new element: `Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix2
-Executing one action: self.actions.modify_text('fix2', 'I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?')
-Add text `I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input: <textarea class="form-control" id="fix2" name="fix2" oninput="changeDialog(this);" placeholder="Modify: Where were you when you did this? It doesn't sound like something you would do." rows="2" style="width: 100%;"></textarea>
- - - - - - -  starting a new element: `Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix3
-Executing one action: self.actions.modify_text('fix3', "I accidentally spilled coffee on a customer.")
-Add text `I accidentally spilled coffee on a customer.` to this text input: <textarea class="form-control" id="fix3" name="fix3" oninput="changeDialog(this);" placeholder="Modify: The customer made me mad." rows="2" style="width: 100%;"></textarea>
- - - - - - -  starting a new element: `Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix4
-Executing one action: self.actions.modify_text('fix4', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?')
-Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input: <textarea class="form-control" id="fix4" name="fix4" oninput="changeDialog(this);" placeholder="Modify: It is not okay to pour coffee on someone on purpose.  It is especially not okay to do this at work." rows="4" style="width: 100%;"></textarea>
- - - - - - -  starting a new element: `Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: fix5
-Executing one action: self.actions.modify_text('fix5', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?')
-Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input: <textarea class="form-control" id="fix5" name="fix5" oninput="changeDialog(this);" placeholder="Modify: The customer was an asshole, every day he complains about something." rows="2" style="width: 100%;"></textarea>
- - - - - - -  starting a new element: `Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: norm
-Executing one action: self.actions.modify_radio('norm', 's6')
-We are going to select this radio button: <input class="form-check-input" id="s6" name="norm" type="radio" value="s6" required="">
- - - - - - -  starting a new element: `Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: socialnew
-Executing one action: self.actions.modify_text('socialnew', 'It’s important to handle conflicts with customers professionally and calmly.')
-Add text `It’s important to handle conflicts with customers professionally and calmly.` to this text input: <textarea class="form-control" style="background-color:white" id="socialnew" name="socialnew" placeholder="Please write your rule-of-thumb here" rows="2"></textarea>
- - - - - - -  starting a new element: `Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: response
-Executing one action: self.actions.modify_text('response', 'I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?')
-Add text `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` to this text input: <textarea class="form-control" id="response" name="response" oninput="updateCounter(this,1,3,30,400);changeDialog(this);" placeholder="Write your response here (> 30 characters)" required="" rows="5"></textarea>
- - - - - - -  starting a new element: `Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: norm_fix
-Executing one action: self.actions.modify_checkbox('norm_fix', 'on')
-Looking for checkboxes with `name`: `norm_fix` the values: `['on']`
- ** Warning **: input value is []. So, we're not going to modify the checkbox.
- - - - - - -  starting a new element: `Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: socialfix
-Executing one action: self.actions.modify_text('socialfix', 'It’s important to handle conflicts with customers professionally and calmly.')
-Add text `It’s important to handle conflicts with customers professionally and calmly.` to this text input: <textarea class="form-control" style="background-color:white" id="socialfix" name="socialfix" placeholder="Please write the revised rule-of-thumb here. If there are multiple RoTs, seperate them with semicolons (;)" rows="2"></textarea>
- - - - - - -  starting a new element: `Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)` - - - - - -  
-input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)
-about to try executing one action, on the following input: certify-no-pii-response
-Executing one action: self.actions.modify_checkbox('certify-no-pii-response', True)
-Looking for checkboxes with `name`: `certify-no-pii-response` the values: `['True']`
- ** Warning **: input value is []. So, we're not going to modify the checkbox.
-input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I accidentally spilled coffee on a customer.']
- visible_values : ['I accidentally spilled coffee on a customer.']
-input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
- visible_values : ['I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
-input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I accidentally spilled coffee on a customer.']
- visible_values : ['I accidentally spilled coffee on a customer.']
-input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
- visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
-input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
- visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
-input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['s6']
- visible_values : ['s6']
-input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['It’s important to handle conflicts with customers professionally and calmly.']
- visible_values : ['It’s important to handle conflicts with customers professionally and calmly.']
-input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
- visible_values : ['I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?']
-input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)
- visible : []
- visible_values : []
-input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['It’s important to handle conflicts with customers professionally and calmly.']
- visible_values : ['It’s important to handle conflicts with customers professionally and calmly.']
-input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`)
- visible : ['on']
- visible_values : ['on']
- ------- evaluating input: Input(name=`fix1`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`fix2`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-answers after mapping: `["I'm shocked you would do this. Why would you do this?"]`
-----> answers: `["I'm shocked you would do this. Why would you do this?"]` - type: `<class 'list'>`
-----> baseline_answer: `I understand how a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` - type: `<class 'str'>`
- --> scores:  0.06666666666666667
-----> per-field score: 0.06666666666666667
- ------- evaluating input: Input(name=`fix3`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`fix4`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`fix5`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`norm`, type=`radio`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-----> answers: `['s1']` - type: `<class 'list'>`
-----> baseline_answer: `s6` - type: `<class 'str'>`
---> Computing the majority vote
- --> scores:  0.0
-----> per-field score: 0.0
- ------- evaluating input: Input(name=`socialnew`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`response`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-answers after mapping: `["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."]`
-----> answers: `["I'm sorry you had to put up with a customer like this. You shouldn't spill coffee on people though. It is hot and can really hurt them."]` - type: `<class 'list'>`
-----> baseline_answer: `I understand how dealing with a difficult customer can be frustrating, but reacting aggressively can make things worse. What happened?` - type: `<class 'str'>`
- --> scores:  0.20408163265306123
-----> per-field score: 0.20408163265306123
- ------- evaluating input: Input(name=`norm_fix`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-----> answers: `['']` - type: `<class 'list'>`
-----> baseline_answer: `` - type: `<class 'str'>`
-----> per-field score: 1.0
- ------- evaluating input: Input(name=`socialfix`, type=`textarea`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-answers after mapping: `['']`
- ------- evaluating input: Input(name=`certify-no-pii-response`, type=`checkbox`, task=`ethics_sbic dialogue 2nd 0`) ------- 
-----> answers: `['on']` - type: `<class 'list'>`
-----> baseline_answer: `on` - type: `<class 'str'>`
-Model answers: on 
-Gold answers: ['on']
- --> scores:  1.0
-----> per-field score: 1.0
- --> Per-instance overall score: 0.45414965986394557
- --> Per-instance per-field breakdown: {'textarea': [0.06666666666666667, 0.20408163265306123], 'radio': [0.0], 'checkbox': [1.0, 1.0]}
-Task: ethics_sbic dialogue 2nd 0 --> Score: 0.45414965986394557