From c4749ec170d551f25631ac3485919cfd1a247448 Mon Sep 17 00:00:00 2001
From: Ronan <ronan_2@hotmail.com>
Date: Mon, 6 May 2024 11:48:47 +0200
Subject: [PATCH] feat: add new filters rules

---
 app/pages/2_Metadata.py     |  11 ++++
 app/pages/5_Clean_Tables.py | 112 ++++++++++++++++++++++++++++++++----
 2 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/app/pages/2_Metadata.py b/app/pages/2_Metadata.py
index e1ce0a4..f07a309 100644
--- a/app/pages/2_Metadata.py
+++ b/app/pages/2_Metadata.py
@@ -41,6 +41,7 @@
                 currency = st.session_state["metadata"]["currency"]
                 unit = st.session_state["metadata"]["unit"]
                 headquarter = st.session_state["metadata"]["headquarter"]
+                decimal_separator = st.session_state["metadata"].get("separator") or ","
             else:
                 company_name = None
                 sector = None
@@ -48,6 +49,15 @@
                 currency = None
                 unit = None
                 headquarter = ""
+                decimal_separator = ","
+
+            # Pre-select the separator already held in decimal_separator.
+            separator_choices = [",", "."]
+            preselected = separator_choices.index(decimal_separator)
+            decimal_separator = st.selectbox(
+                "Decimal separator", separator_choices, index=preselected
+            )
+
             companies = list(COMPANIES.keys())
             company_name = st.selectbox(
                 "Company name",
@@ -100,6 +110,7 @@
             )
             if submitted:
                 st.session_state["metadata"] = {
+                    "separator": decimal_separator,
                     "company_name": company_name,
                     "sector": sector,
                     "year": year,
diff --git a/app/pages/5_Clean_Tables.py b/app/pages/5_Clean_Tables.py
index 97e1338..a3a9573 100644
--- a/app/pages/5_Clean_Tables.py
+++ b/app/pages/5_Clean_Tables.py
@@ -50,12 +50,12 @@ def convert_dataframe(dataframe: pd.DataFrame) -> pd.DataFrame:
     for column_name in dataframe.columns:
         try:
             dataframe[column_name] = dataframe[column_name].astype(float)
-        except Exception:
+        except Exception:  # best effort: a non-numeric column is left as-is
             pass
     return dataframe
 
 
-special_characters = "#&()[]@©"
+special_characters = "#&()[]@©€$'R¹³²"
 
 
 def style_symbol(v, props=""):
@@ -181,25 +181,52 @@ def update_df_csv_to_save() -> None:
             height=900,
         )
 
+    st.subheader(
+        "Filters : ",
+    )
+
     col7, col8, col9 = st.columns([1, 1, 1])
     with col7:
         total = st.checkbox(
             "Calculate the Total of each columns, excluding the last row", value=True
         )
         country = st.checkbox("Activate the country filter", value=True)
+        decimal_cleanup = st.checkbox("Apply decimal cleanup")
 
     with col8:
         negativ = st.checkbox(
             "Show the negative numbers, for each columns detected as a numerical type"
         )
+
+        with st.container(border=True):  # bordered box: toggle + its exclusions
+            cleanup_rules = st.checkbox(
+                "Apply clean up rules : (number) mean a negative number, o-> 0, homogenization NA, ect ect "
+            )
+            if cleanup_rules:  # cleanup_excluded is only bound while ticked
+                cleanup_excluded = st.multiselect(
+                    "exclude from filtering",
+                    st.session_state.tables[st.session_state["algorithm_name"]].columns,
+                    key="cleanup",
+                )
+
     with col9:
-        symbol = st.checkbox(
-            "Show the cells that contain a special symbol : " + special_characters,
-            value=True,
-        )
-        remove_symbols = st.checkbox("Remove the special symbols")
+        with st.container(border=True):  # bordered box: symbol toggles + exclusions
+            symbol = st.checkbox(
+                "Show the cells that contain a special symbol : " + special_characters,
+                value=True,
+            )
+            # NOTE(review): the removal loop does not filter by dtype - confirm label
+            remove_symbols = st.checkbox(
+                "Remove the special symbols on numeric columns"
+            )
+            if remove_symbols:  # rm_symbol_excluded is only bound while ticked
+                rm_symbol_excluded = st.multiselect(
+                    "exclude from filtering",
+                    st.session_state.tables[st.session_state["algorithm_name"]].columns,
+                    key="rm_symbol",
+                )
 
     dataframe = st.session_state.tables[st.session_state["algorithm_name"]].copy()
+    dataframe = convert_dataframe(dataframe)
 
     if country:
         dataframe.iloc[:-2, 0] = dataframe.iloc[:-2, 0].apply(
@@ -207,13 +234,74 @@ def update_df_csv_to_save() -> None:
         )
 
     if remove_symbols:
-        pattern = "\(.*?\)" + "|[" + re.escape(special_characters) + "]"
-        for column in dataframe.columns:
-            dataframe[column] = dataframe[column].apply(
-                lambda x: re.sub(pattern, "", str(x))
-            )
+        pattern = "[" + re.escape(special_characters) + "]"
+        for column in dataframe.columns:
+            if column not in rm_symbol_excluded:
+                # Series.replace only rewrites text cells, so NaN/None and
+                # numbers are not turned into "nan"/"None"/"1.0" strings.
+                dataframe[column] = dataframe[column].replace(pattern, "", regex=True)
         dataframe = convert_dataframe(dataframe)
 
+    if cleanup_rules:
+        for column, dtype in dataframe.dtypes.items():
+            if column not in cleanup_excluded:
+                # this is a code translated by chatgpt from Kane's R code
+                dataframe[column] = dataframe[column].replace(
+                    {"^-$|^$|^ $|^N/I$|^- -$|^N/A$|^n\\.a\\.$": None}, regex=True
+                )
+                dataframe[column] = dataframe[column].replace(
+                    {"^o$|^O$|^\\(o\\)$|^\\(O\\)$|^\\(0\\)$": "0"}, regex=True
+                )
+
+                if dtype == object:
+                    # Python's re has no POSIX classes: "[:alnum:]" is read as
+                    # the plain set ":alnum". [^\W_] is any letter or digit and
+                    # [^\W\d_] any letter (presumably the R original's intent).
+                    dataframe[column] = (
+                        dataframe[column]
+                        .str.replace("(\\(.*\\))[^\\W_]+", "\\1", regex=True)
+                        .str.replace("\\([^\\W_]+$|\\)[^\\W_]+$", "", regex=True)
+                        .str.replace("\\([^\\W\\d_]+\\)", "", regex=True)
+                    )
+                    dataframe[column] = dataframe[column].str.replace(
+                        "(.+)\\(.+\\)$", "\\1", regex=True
+                    )
+                    dataframe[column] = dataframe[column].str.replace(
+                        "^\\(-(.*)\\)", "-\\1", regex=True
+                    )
+                    dataframe[column] = dataframe[column].str.replace(
+                        "^\\((.*)\\)", "-\\1", regex=True
+                    )
+                    dataframe[column] = dataframe[column].str.replace(
+                        "\\(.*\\)| |\\*|^-$|\\[.*\\]|^-€$", "", regex=True
+                    )
+        dataframe = convert_dataframe(dataframe)
+    if decimal_cleanup:
+        # Normalise number strings to a "." decimal mark with no thousands
+        # separator. The source convention is chosen on the Metadata page;
+        # "," is the fallback when the key is missing or empty.
+        metadata = st.session_state["metadata"]
+        decimal_separator = metadata.get("separator") or ","
+        for column, dtype in dataframe.dtypes.items():
+            if dtype == object:
+                if decimal_separator == ",":
+                    # regex=False takes the pattern literally, so the dot
+                    # must not be escaped ("\\." would never match "1.234").
+                    dataframe[column] = dataframe[column].str.replace(
+                        ".", "", regex=False
+                    )
+                    dataframe[column] = dataframe[column].str.replace(
+                        ",", ".", regex=False
+                    )
+                else:
+                    # "1,5" -> "1.5", "1.234" -> "1,234", then commas are dropped.
+                    dataframe[column] = (
+                        dataframe[column]
+                        .str.replace(",(.{1,2})$", ".\\1", regex=True)
+                        .str.replace("\\.([0-9]{3})", ",\\1", regex=True)
+                        .str.replace(",", "", regex=False)
+                    )
+
     if total:
         dataframe = convert_dataframe(dataframe)
         new_row = dataframe.apply(column_sum, axis=0)