From 0751d8600b3cebad383065a2987bdfff19d940af Mon Sep 17 00:00:00 2001
From: Andrew Li <atl15c02@gmail.com>
Date: Tue, 23 Apr 2024 17:06:36 -0500
Subject: [PATCH] update categorical profiler

---
 .../profilers/categorical_column_profile.py   |  10 +-
 .../test_categorical_column_profile.py        | 104 +++++++++---------
 2 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index 1ca63090..708a7448 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -7,7 +7,9 @@
 from typing import cast
 
 import datasketches
-from pandas import DataFrame, Series
+import pandas as pd
+import polars as pl
+from polars import DataFrame, Series
 
 from .. import dp_logging
 from . import profiler_utils
@@ -601,7 +603,8 @@ def _get_categories_full(self, df_series) -> dict:
         :return: dict of counts for each unique value
         :rtype: dict
         """
-        category_count: dict = df_series.value_counts(dropna=False).to_dict()
+        value_counts = df_series.value_counts(sort=True)
+        category_count: dict = dict(value_counts.iter_rows())
         return category_count
 
     @BaseColumnProfiler._timeit(name="categories")
@@ -678,6 +681,9 @@ def update(self, df_series: Series) -> CategoricalColumn:
         :return: updated CategoricalColumn
         :rtype: CategoricalColumn
         """
+        # TODO: remove once the profiler builder is updated; converts pandas input
+        if isinstance(df_series, pd.Series):
+            df_series = pl.from_pandas(df_series)  # type: ignore
         # If condition for limiting profile calculations
         if len(df_series) == 0 or self._stop_condition_is_met:
             return self
diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py
index 55d2ea68..f3255a85 100644
--- a/dataprofiler/tests/profilers/test_categorical_column_profile.py
+++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import polars as pl
 
 from dataprofiler.profilers import CategoricalColumn
 from dataprofiler.profilers.json_decoder import load_column_profile
@@ -19,6 +20,8 @@
 
 
 class TestCategoricalColumn(unittest.TestCase):
+    maxDiff = None
+
     @classmethod
     def setUp(self):
         test_utils.set_seed(seed=0)
@@ -29,10 +32,12 @@ def setUpClass(cls):
             test_root_path, "data", "csv/aws_honeypot_marx_geo.csv"
         )
         columns_to_read = ["host", "localeabbr"]
-        cls.aws_dataset = pd.read_csv(cls.input_file_path)[columns_to_read]
+        cls.aws_dataset = pl.read_csv(cls.input_file_path, infer_schema_length=0)[
+            columns_to_read
+        ]
 
     def test_correct_categorical_model_string(self):
-        dataset = self.aws_dataset["host"].dropna()
+        dataset = self.aws_dataset["host"].drop_nulls()
         profile = CategoricalColumn(dataset.name)
         profile.update(dataset)
         self.assertEqual(1.0, profile.is_match)
@@ -51,7 +56,7 @@ def test_correct_categorical_model_string(self):
         self.assertCountEqual(categories, profile.categories)
 
     def test_stop_condition_is_met_initially(self):
-        dataset = pd.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10)
+        dataset = pl.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10)
         profile = CategoricalColumn("test dataset")
         profile.max_sample_size_to_check_stop_condition = 0
         profile.stop_condition_unique_value_ratio = 0
@@ -64,7 +69,7 @@ def test_stop_condition_is_met_initially(self):
         self.assertFalse(profile.is_match)
 
     def test_stop_condition_is_met_after_initial_profile(self):
-        dataset = pd.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10)
+        dataset = pl.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10)
         profile = CategoricalColumn("test dataset")
         profile.max_sample_size_to_check_stop_condition = len(dataset) + 1
         profile.stop_condition_unique_value_ratio = 0
@@ -72,7 +77,7 @@ def test_stop_condition_is_met_after_initial_profile(self):
 
         self.assertFalse(profile._stop_condition_is_met)
 
-        dataset.loc[len(dataset.index)] = "Testing past ratio"
+        dataset.append(pl.Series(["Testing past ratio"]))
         profile.update(dataset)
 
         self.assertTrue(profile._stop_condition_is_met)
@@ -90,7 +95,7 @@ def test_stop_condition_is_met_after_initial_profile(self):
         self.assertFalse(profile.is_match)
 
     def test_timeit_profile(self):
-        dataset = self.aws_dataset["host"].dropna()
+        dataset = self.aws_dataset["host"].drop_nulls()
         profile = CategoricalColumn(dataset.name)
 
         time_array = [float(x) for x in range(17, 0, -1)]
@@ -109,7 +114,7 @@ def test_timeit_profile(self):
             self.assertEqual(expected, profile.profile["times"])
 
     def test_mixed_categorical_col_integer_string(self):
-        dataset = self.aws_dataset["localeabbr"].dropna()
+        dataset = self.aws_dataset["localeabbr"].drop_nulls()
         profile = CategoricalColumn(dataset.name)
         profile.update(dataset)
 
@@ -368,7 +373,7 @@ def test_categorical_mapping(self):
         self.assertNotEqual(num_nan_count, len(column_profile.null_types_index["NaN"]))
 
     def test_true_categorical_report(self):
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",
                 "a",
@@ -415,7 +420,7 @@ def test_true_categorical_report(self):
         self.assertEqual(report, expected_profile)
 
     def test_false_categorical_report(self):
-        df_non_categorical = pd.Series(list(map(str, range(0, 20))))
+        df_non_categorical = pl.Series(list(map(str, range(0, 20))))
         profile = CategoricalColumn(df_non_categorical.name)
         profile.update(df_non_categorical)
 
@@ -433,7 +438,7 @@ def test_false_categorical_report(self):
         self.assertEqual(report, expected_profile)
 
     def test_report(self):
-        df_non_categorical = pd.Series(list(map(str, range(0, 20))))
+        df_non_categorical = pl.Series(list(map(str, range(0, 20))))
         profile = CategoricalColumn(df_non_categorical.name)
         profile.update(df_non_categorical)
 
@@ -444,10 +449,10 @@ def test_report(self):
         self.assertDictEqual(report1, report3)
 
     def test_categorical_merge(self):
-        df1 = pd.Series(
-            ["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2", np.nan]
+        df1 = pl.Series(
+            ["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2", None]
         )
-        df2 = pd.Series(
+        df2 = pl.Series(
             ["1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa", "b", "ee"]
         )
 
@@ -460,7 +465,7 @@ def test_categorical_merge(self):
             "3",
             "2",
             "dfd",
-            np.nan,
+            None,
             "1",
             "null",
             "ee",
@@ -482,7 +487,7 @@ def test_categorical_merge(self):
             "3": 1,
             "2": 2,
             "dfd": 1,
-            np.nan: 1,
+            None: 1,
         }
         self.assertDictEqual(expected_dict, profile._categories)
 
@@ -503,7 +508,7 @@ def test_categorical_merge(self):
             "4": 1,
             "3": 1,
             "2": 2,
-            np.nan: 1,
+            None: 1,
             "dfd": 1,
             "1": 1,
             "ee": 2,
@@ -581,7 +586,7 @@ def test_categorical_merge(self):
                 "abcd",
                 "aa",
                 "2",
-                np.nan,
+                None,
                 "4",
                 "b",
                 "3",
@@ -601,7 +606,7 @@ def test_categorical_merge(self):
             "2": 4,
             "abcd": 4,
             "b": 3,
-            np.nan: 2,
+            None: 2,
             "dfd": 2,
             "3": 2,
             "4": 2,
@@ -681,32 +686,32 @@ def test_categorical_merge(self):
 
     def test_gini_impurity(self):
         # Normal test
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         expected_val = ((4 / 7) * (3 / 7)) + ((4 / 7) * (3 / 7))
         self.assertAlmostEqual(profile.gini_impurity, expected_val)
 
         # One class only test
-        df_categorical = pd.Series(["y", "y", "y", "y", "y", "y", "y"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "y", "y", "y"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         expected_val = 0
         self.assertEqual(profile.gini_impurity, expected_val)
 
         # Empty test
-        df_categorical = pd.Series([])
+        df_categorical = pl.Series([])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.gini_impurity, None)
 
     def test_categorical_diff(self):
         # test psi new category in another profile
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
 
-        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
         profile2 = CategoricalColumn(df_categorical.name)
         profile2.update(df_categorical)
 
@@ -734,7 +739,7 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, actual_diff)
 
         # Test with one categorical column matching
-        df_not_categorical = pd.Series(
+        df_not_categorical = pl.Series(
             [
                 "THIS",
                 "is",
@@ -759,11 +764,11 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
         # Test diff with psi enabled
-        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
 
-        df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
+        df_categorical = pl.Series(["y", "maybe", "y", "y", "n", "n", "maybe"])
         profile2 = CategoricalColumn(df_categorical.name)
         profile2.update(df_categorical)
 
@@ -787,32 +792,32 @@ def test_categorical_diff(self):
         self.assertDictEqual(expected_diff, profile.diff(profile2))
 
     def test_unalikeability(self):
-        df_categorical = pd.Series(["a", "a"])
+        df_categorical = pl.Series(["a", "a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 0)
 
-        df_categorical = pd.Series(["a", "c", "b"])
+        df_categorical = pl.Series(["a", "c", "b"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 1)
 
-        df_categorical = pd.Series(["a", "a", "a", "b", "b", "b"])
+        df_categorical = pl.Series(["a", "a", "a", "b", "b", "b"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 18 / 30)
 
-        df_categorical = pd.Series(["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"])
+        df_categorical = pl.Series(["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(profile.unalikeability, 2 * (10 + 15 + 6) / 90)
 
-        df_categorical = pd.Series(["a"])
+        df_categorical = pl.Series(["a"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(0, profile.unalikeability)
 
-        df_categorical = pd.Series([])
+        df_categorical = pl.Series([])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         self.assertEqual(None, profile.unalikeability)
@@ -820,7 +825,7 @@ def test_unalikeability(self):
     def test_top_k_categories_change(self):
         # Test if top_k_categories is None
         options = CategoricalOptions()
-        df_series = pd.Series(["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"])
+        df_series = pl.Series(["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"])
         profile = CategoricalColumn(df_series.name, options)
         profile.update(df_series)
         self.assertEqual(len(profile.profile["statistics"]["categorical_count"]), 7)
@@ -831,7 +836,7 @@ def test_top_k_categories_change(self):
 
         # Test if top_k_categories is greater than the count of categories
         options.top_k_categories = 6
-        df_series = pd.Series(["a", "a", "b", "c", "d"])
+        df_series = pl.Series(["a", "a", "b", "c", "d"])
         profile = CategoricalColumn(df_series.name, options)
         profile.update(df_series)
         self.assertEqual(len(profile.profile["statistics"]["categorical_count"]), 4)
@@ -883,7 +888,8 @@ def test_json_encode(self):
         self.assertEqual(serialized, expected)
 
     def test_json_encode_after_update(self):
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
+            None,
             [
                 "a",
                 "a",
@@ -897,7 +903,7 @@ def test_json_encode_after_update(self):
                 "c",
                 "c",
                 "c",
-            ]
+            ],
         )
         profile = CategoricalColumn(df_categorical.name)
 
@@ -909,7 +915,7 @@ def test_json_encode_after_update(self):
             {
                 "class": "CategoricalColumn",
                 "data": {
-                    "name": None,
+                    "name": "",
                     "col_index": np.nan,
                     "sample_size": 12,
                     "metadata": {},
@@ -947,7 +953,7 @@ def test_json_decode_after_update(self):
         # Actual deserialization
 
         # Build expected CategoricalColumn
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",
                 "a",
@@ -973,7 +979,7 @@ def test_json_decode_after_update(self):
 
         test_utils.assert_profiles_equal(deserialized, expected_profile)
 
-        df_categorical = pd.Series(
+        df_categorical = pl.Series(
             [
                 "a",  # add existing
                 "d",  # add new
@@ -987,7 +993,7 @@ def test_json_decode_after_update(self):
         assert deserialized.categorical_counts == {"c": 5, "b": 4, "a": 4, "d": 1}
 
     def test_cms_max_num_heavy_hitters(self):
-        df_categorical = pd.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10)
+        df_categorical = pl.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1002,8 +1008,8 @@ def test_cms_max_num_heavy_hitters(self):
         self.assertTrue(profile.sample_size >= 10)
 
     def test_cms_update_hybrid_batch_stream(self):
-        dataset = pd.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14)
-        dataset1 = pd.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1)
+        dataset = pl.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14)
+        dataset1 = pl.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1031,8 +1037,8 @@ def test_cms_update_hybrid_batch_stream(self):
 
     def test_cms_profile_merge_via_add(self):
 
-        dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
-        dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
+        dataset = pl.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
+        dataset1 = pl.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
 
         expected_categories = ["b", "c"]
         expected_categories_dict = {"b": 22, "c": 23}
@@ -1074,8 +1080,8 @@ def test_cms_profile_merge_via_add(self):
 
     def test_cms_profile_min_max_num_heavy_hitters(self):
 
-        dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
-        dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
+        dataset = pl.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9)
+        dataset1 = pl.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1097,8 +1103,8 @@ def test_cms_profile_min_max_num_heavy_hitters(self):
 
     def test_cms_catch_overwriting_with_missing_dict(self):
 
-        dataset = pd.Series(["b"] * 2 + ["c"] * 14)
-        dataset1 = pd.Series(["b"] * 5 + ["c"] * 10)
+        dataset = pl.Series(["b"] * 2 + ["c"] * 14)
+        dataset1 = pl.Series(["b"] * 5 + ["c"] * 10)
 
         options = CategoricalOptions()
         options.cms = True
@@ -1126,7 +1132,7 @@ def test_cms_catch_overwriting_with_missing_dict(self):
 
     def test_cms_vs_full_mismatch_merge(self):
 
-        dataset = pd.Series(["b"] * 2 + ["c"] * 14)
+        dataset = pl.Series(["b"] * 2 + ["c"] * 14)
 
         options = CategoricalOptions()
         options.cms = True