Fix issue created by last fix

dataforgoodfr · Apr 30, 2024 · cba3c42 · cba3c42
1 parent 49ce7b7
commit cba3c42
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 13 deletions.
diff --git a/eval/eval_app.py b/eval/eval_app.py
@@ -35,8 +35,8 @@
 from streamlit_option_menu import option_menu
 from utils import (
     append_count_to_duplicates,
+    clean_headers,
     convert_to_str,
-    fill_df_empty_headers,
     reformat_str,
 )
 
@@ -243,17 +243,8 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
                 # Pull selected table
                 df = dfs[selected_idx]
 
-                # Erase any "Unnamed" headers originating from html to df conversion
-                # Test with Unstructured detectron2_onnx applied to ACS_2019.pdf
-                clean_columns = []
-                for col in df.columns:
-                    clean_columns.append([item for item in col if "Unnamed" not in item])
-                # Convert any multi-row headers to single row to prevent st.dataframe error
-                # Test with Unstructured detectron2_onnx applied to ACS_2019.pdf
-                df.columns = [": ".join(set(col)) for col in clean_columns]
-
-                # Fill any empty headers to prevent st.dataframe error
-                fill_df_empty_headers(df)
+                # Clean headers to prevent any st.dataframe error
+                clean_headers(df)
 
                 # Check if values in table are in tables of reference extraction
                 refvalues = []

diff --git a/eval/utils.py b/eval/utils.py
@@ -56,7 +56,21 @@ def reformat_str(el: any) -> str:
     return re.sub(r"\((\d+)\)", r"-\1", el)
 
 
-def fill_df_empty_headers(df: pd) -> str:
+def clean_headers(df: pd) -> str:
+
+    # Flatten any multi-row (MultiIndex) headers into a single row to prevent an st.dataframe error
+    # To test: apply Unstructured detectron2_onnx to ACS_2019.pdf
+    if isinstance(df.columns, pd.MultiIndex):
+        # First drop any "Unnamed" header entries introduced by the HTML-to-DataFrame conversion
+        clean_columns = []
+        for col in df.columns:
+            clean_columns.append(
+                [item for item in col if "Unnamed" not in item],
+            )
+
+        df.columns = [": ".join(set(col)) for col in clean_columns]
+
+    # Fill any empty headers
     if df.columns.duplicated().sum() > 0:
         cols = pd.Series(df.columns)
         for dup in set(df.columns[df.columns.duplicated()]):