Skip to content

Commit

Permalink
Fix issue created by last fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Guillaume Millot authored and Guillaume Millot committed Apr 30, 2024
1 parent 49ce7b7 commit cba3c42
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 13 deletions.
15 changes: 3 additions & 12 deletions eval/eval_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@
from streamlit_option_menu import option_menu
from utils import (
append_count_to_duplicates,
clean_headers,
convert_to_str,
fill_df_empty_headers,
reformat_str,
)

Expand Down Expand Up @@ -243,17 +243,8 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
# Pull selected table
df = dfs[selected_idx]

# Erase any "Unnamed" headers originating from html to df conversion
# Test with Unstructured detectron2_onnx applied to ACS_2019.pdf
clean_columns = []
for col in df.columns:
clean_columns.append([item for item in col if "Unnamed" not in item])
# Convert any multi-row headers to single row to prevent st.dataframe error
# Test with Unstructured detectron2_onnx applied to ACS_2019.pdf
df.columns = [": ".join(set(col)) for col in clean_columns]

# Fill any empty headers to prevent st.dataframe error
fill_df_empty_headers(df)
# Clean headers to prevent any st.dataframe error
clean_headers(df)

# Check if values in table are in tables of reference extraction
refvalues = []
Expand Down
16 changes: 15 additions & 1 deletion eval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,21 @@ def reformat_str(el: any) -> str:
return re.sub(r"\((\d+)\)", r"-\1", el)


def fill_df_empty_headers(df: pd) -> str:
def clean_headers(df: pd) -> str:

# Transform any multi-row headers to single row to prevent st.dataframe error
# Test with Unstructured detectron2_onnx applied to ACS_2019.pdf
if isinstance(df.columns, pd.MultiIndex):
# Erase first any "Unnamed" headers originating from html to df conversion
clean_columns = []
for col in df.columns:
clean_columns.append(
[item for item in col if "Unnamed" not in item],
)

df.columns = [": ".join(set(col)) for col in clean_columns]

# Fill any empty headers
if df.columns.duplicated().sum() > 0:
cols = pd.Series(df.columns)
for dup in set(df.columns[df.columns.duplicated()]):
Expand Down

0 comments on commit cba3c42

Please sign in to comment.