From 71c2f5a8281f1cedf44f3997baa9c3a14b61cfad Mon Sep 17 00:00:00 2001 From: Ronan Date: Thu, 18 Apr 2024 14:49:53 +0200 Subject: [PATCH 1/4] fix (Llama config): typing error + add logs --- app/pages/0_Import_File.py | 2 +- country_by_country/table_extraction/__init__.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 2e37f9c..785e91e 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -50,7 +50,7 @@ modelfile: random_forest_model_low_false_positive.joblib table_extraction: - - type: LLamaParse + - type: LlamaParse - type: Unstructured params: hi_res_model_name: "yolox" diff --git a/country_by_country/table_extraction/__init__.py b/country_by_country/table_extraction/__init__.py index b6a0c20..e61031c 100644 --- a/country_by_country/table_extraction/__init__.py +++ b/country_by_country/table_extraction/__init__.py @@ -21,12 +21,17 @@ # SOFTWARE. # Local imports +import logging +import sys + from .camelot_extractor import Camelot from .from_csv import FromCSV from .llama_parse_extractor import LlamaParseExtractor from .unstructured import Unstructured from .unstructured_api import UnstructuredAPI +logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s") + def from_config(config: dict) -> Camelot: extractor_type = config["type"] @@ -52,3 +57,5 @@ def from_config(config: dict) -> Camelot: from .extract_table_api import ExtractTableAPI return ExtractTableAPI(**extractor_params) + else: + logging.info(f"There are no extractors of the type : {extractor_type}") From e2ce6d3a48ab26bf6433ae236997764e979f643d Mon Sep 17 00:00:00 2001 From: Ronan Date: Mon, 22 Apr 2024 21:53:14 +0200 Subject: [PATCH 2/4] fixes : many : llama integration, merge behavior ... --- app/pages/2_Merge_Tables.py | 36 +++++++++---------- app/pages/3_Clean_Headers.py | 4 +-- app/pages/4_Clean_Tables.py | 18 +++++----- .../table_extraction/llama_parse_extractor.py | 10 ++++++ country_by_country/utils/utils.py | 33 +++++++---------- 5 files changed, 50 insertions(+), 51 deletions(-) diff --git a/app/pages/2_Merge_Tables.py b/app/pages/2_Merge_Tables.py index 1d337f3..3c322ce 100644 --- a/app/pages/2_Merge_Tables.py +++ b/app/pages/2_Merge_Tables.py @@ -16,16 +16,19 @@ def merge_table(table_extractor: str) -> None: - for asset in st.session_state["assets"]["table_extractors"]: - if asset["type"] == table_extractor: - first_df_columns = asset["tables"][0].columns + first_df_columns = pd.Series([]) + table_list = [] + for key, table in st.session_state["tables"].items(): + if table_extractor in key: + if first_df_columns.empty: + first_df_columns = table.columns # Replace column names for all DataFrames in the list - for df in asset["tables"]: - df.columns = first_df_columns + table.columns = first_df_columns + table_list.append(table) - st.session_state["new_tables"] = pd.concat( - asset["tables"], ignore_index=True, sort=False - ) + st.session_state["new_tables"] = pd.concat( + table_list, ignore_index=True, sort=False + ) def save_merge(table_extractor: str) -> None: @@ -83,16 +86,13 @@ def save_merge(table_extractor: str) -> None: ) if table_extractor is not None: - for asset in st.session_state["assets"]["table_extractors"]: - i = 0 - if asset["type"] == table_extractor: - for table in asset["tables"]: - st.markdown("Table shape :" + str(table.shape)) - st.markdown("Table index : _" + str(i)) - i += 1 - st.dataframe( - table, - ) + for key, table in st.session_state["tables"].items(): + if table_extractor in key: + st.markdown("Table shape :" + str(table.shape)) + st.markdown("Table name : " + key) + st.dataframe( + table, + ) with col2: st.markdown( diff --git a/app/pages/3_Clean_Headers.py b/app/pages/3_Clean_Headers.py index c9bc0fc..d6a19f7 100644 --- a/app/pages/3_Clean_Headers.py +++ b/app/pages/3_Clean_Headers.py @@ -111,10 +111,8 @@ def set_headers(algorithm_name: str) -> None: st.markdown("# Current extraction") st.markdown("The extracted table is displaye below") - df = st.data_editor( + st.dataframe( st.session_state.tables[st.session_state["algorithm_name"]], - num_rows="dynamic", width=900, height=900, - disabled=True, ) diff --git a/app/pages/4_Clean_Tables.py b/app/pages/4_Clean_Tables.py index c95a53b..6bf6569 100644 --- a/app/pages/4_Clean_Tables.py +++ b/app/pages/4_Clean_Tables.py @@ -55,7 +55,7 @@ def convert_dataframe(dataframe: pd.DataFrame) -> pd.DataFrame: return dataframe -special_characters = "#&()[]@" +special_characters = "#&()[]@©" def style_symbol(v, props=""): @@ -82,6 +82,8 @@ def update_min(string, min_distance, most_similar, input_string=input_string): else: return min_distance, most_similar + if input_string == None: + return "None" min_distance = float("inf") most_similar = None for string in JURIDICTIONS.keys(): @@ -110,10 +112,6 @@ def update_min(string, min_distance, most_similar, input_string=input_string): and "pdf_after_page_validation" in st.session_state ): - st.session_state.tables[st.session_state["algorithm_name"]] = convert_dataframe( - st.session_state.tables[st.session_state["algorithm_name"]] - ) - col3, col4 = st.columns(2) with col3: st.markdown( @@ -164,6 +162,11 @@ def update_min(string, min_distance, most_similar, input_string=input_string): dataframe = st.session_state.tables[st.session_state["algorithm_name"]].copy() + if country: + dataframe.iloc[:-2, 0] = dataframe.iloc[:-2, 0].apply( + lambda x: most_similar_string(x) + ) + if remove_symbols: pattern = "\(.*?\)" + "|[" + re.escape(special_characters) + "]" for column in dataframe.columns: @@ -178,11 +181,6 @@ def update_min(string, min_distance, most_similar, input_string=input_string): new_row.iloc[0] = "Total Calculated" dataframe.loc[-1] = new_row.transpose() - if country: - dataframe.iloc[:-2, 0] = dataframe.iloc[:-2, 0].apply( - lambda x: most_similar_string(x) - ) - dataframe_styler = dataframe.style if total: diff --git a/country_by_country/table_extraction/llama_parse_extractor.py b/country_by_country/table_extraction/llama_parse_extractor.py index 5d88d11..ec52ee0 100644 --- a/country_by_country/table_extraction/llama_parse_extractor.py +++ b/country_by_country/table_extraction/llama_parse_extractor.py @@ -55,6 +55,16 @@ def __call__(self, pdf_filepath: str) -> dict: for page in json_objs[0]["pages"]: for item in page["items"]: if item["type"] == "table": + # If the number of columns in the header row is greater than the data rows + header_length = len(item["rows"][0]) + + for i in range(1, len(item["rows"])): + while len(item["rows"][i]) < header_length: + item["rows"][i].append("No Extract ") + while len(item["rows"][i]) > header_length: + item["rows"][0].append("No Extract ") + header_length = len(item["rows"][0]) + df = pd.DataFrame(item["rows"][1:], columns=item["rows"][0]) tables_list.append(df) diff --git a/country_by_country/utils/utils.py b/country_by_country/utils/utils.py index 356809d..dd09e8d 100644 --- a/country_by_country/utils/utils.py +++ b/country_by_country/utils/utils.py @@ -65,28 +65,21 @@ def gather_tables( tables_by_name = {} for asset in assets["table_extractors"]: tables = asset["tables"] - if len(tables) == 1: - for column in tables[0].columns: + for i in range(len(tables)): + for label, _content in tables[i].items(): + if isinstance(tables[i][label], pd.DataFrame): + tables[i].columns = [ + "No Extract " + str(i + 1) for i in range(tables[i].shape[1]) + ] + break + for label, content in tables[i].items(): if ( - tables[0][column].dtype == "object" + content.dtype == "object" ): # Check if the column contains string data - tables[0][column] = tables[0][column].replace("", None) - tables[0][column] = tables[0][column].str.replace( - ",", - ".", - ) # else we wont be able to convert to float - tables[0][column] = tables[0][column].str.replace(".", "") - tables_by_name[asset["type"]] = tables[0] - elif len(tables) > 1: - for i in range(len(tables)): - for column in tables[i].columns: - if ( - tables[i][column].dtype == "object" - ): # Check if the column contains string data - tables[i][column] = tables[i][column].replace("", None) - tables[i][column] = tables[i][column].str.replace(",", ".") - tables[i][column] = tables[i][column].str.replace(".", "") - tables_by_name[asset["type"] + "_" + str(i)] = tables[i] + tables[i][label] = tables[i][label].replace("", None) + tables[i][label] = tables[i][label].str.replace(".", "") + tables[i][label] = tables[i][label].str.replace(",", ".") + tables_by_name[asset["type"] + "_" + str(i)] = tables[i] return tables_by_name From 64248c3cb5c8a3247a94961c87a86c6fcd8bee37 Mon Sep 17 00:00:00 2001 From: Ronan Date: Mon, 22 Apr 2024 22:17:22 +0200 Subject: [PATCH 3/4] fix: error with the validated button --- app/pages/4_Clean_Tables.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/app/pages/4_Clean_Tables.py b/app/pages/4_Clean_Tables.py index 5065001..d4a377f 100644 --- a/app/pages/4_Clean_Tables.py +++ b/app/pages/4_Clean_Tables.py @@ -100,6 +100,10 @@ def update_min(string, min_distance, most_similar, input_string=input_string): return most_similar +def validate(data: pd.DataFrame) -> None: + st.session_state.tables[st.session_state["algorithm_name"]] = data + + st.set_page_config(layout="wide", page_title="Tables customization") # page_icon="📈" st.title("Country by Country Tax Reporting analysis : Tables") st.subheader( @@ -239,21 +243,8 @@ def update_min(string, min_distance, most_similar, input_string=input_string): st.dataframe(dataframe_styler, use_container_width=True, height=1000) - validated = st.button( + st.button( "Save the table above", + on_click=validate, + args=(dataframe_styler.data,), ) - if validated: - st.session_state.tables[ - st.session_state["algorithm_name"] - ] = dataframe_styler.data - # This does not work - # Update the csv file to download as well - # print("clicked") - # st.session_state["df_csv_to_save"] = to_csv_file( - # st.session_state.tables[st.session_state["algorithm_name"]] - # ) - # We rather rerun , which reloads the page and updates the data - # to be downloaded - # Otherwise, if you click the download button, you get the previous data - # the first time and then the right data on the second click - st.rerun() From 76ba6521628783b3ab24ca63a190697c9282167a Mon Sep 17 00:00:00 2001 From: Ronan Date: Mon, 22 Apr 2024 22:42:46 +0200 Subject: [PATCH 4/4] fix: update of data_editor --- app/pages/4_Clean_Tables.py | 17 +++++++++++++++-- app/utils.py | 6 ------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/app/pages/4_Clean_Tables.py b/app/pages/4_Clean_Tables.py index d4a377f..90d522f 100644 --- a/app/pages/4_Clean_Tables.py +++ b/app/pages/4_Clean_Tables.py @@ -1,5 +1,5 @@ import streamlit as st -from utils import set_algorithm_name, get_pdf_iframe, to_csv_file, update_df_csv_to_save +from utils import set_algorithm_name, get_pdf_iframe, to_csv_file from menu import display_pages_menu from country_by_country.utils.constants import JURIDICTIONS from Levenshtein import distance @@ -104,6 +104,18 @@ def validate(data: pd.DataFrame) -> None: st.session_state.tables[st.session_state["algorithm_name"]] = data +def update_df_csv_to_save() -> None: + for idx, change in st.session_state.changes["edited_rows"].items(): + for label, value in change.items(): + st.session_state.tables[st.session_state["algorithm_name"]].loc[ + idx, label + ] = value + + st.session_state["df_csv_to_save"] = to_csv_file( + st.session_state.tables[st.session_state["algorithm_name"]], + ) + + st.set_page_config(layout="wide", page_title="Tables customization") # page_icon="📈" st.title("Country by Country Tax Reporting analysis : Tables") st.subheader( @@ -158,10 +170,11 @@ def validate(data: pd.DataFrame) -> None: ), ) - st.session_state.tables[st.session_state["algorithm_name"]] = st.data_editor( + st.data_editor( st.session_state.tables[st.session_state["algorithm_name"]], num_rows="dynamic", on_change=update_df_csv_to_save, + key="changes", width=800, height=900, ) diff --git a/app/utils.py b/app/utils.py index f07e5e0..ca22dba 100644 --- a/app/utils.py +++ b/app/utils.py @@ -21,9 +21,3 @@ def set_algorithm_name(my_key: str) -> None: @st.cache_data def to_csv_file(df: pd.DataFrame) -> bytes: return df.to_csv(index=False).encode("utf-8") - - -def update_df_csv_to_save() -> None: - st.session_state["df_csv_to_save"] = to_csv_file( - st.session_state.tables[st.session_state["algorithm_name"]], - )