From 18772d2936e03c5c927b71e9c1e2982206dbff3f Mon Sep 17 00:00:00 2001 From: ubd725 Date: Tue, 23 Apr 2024 11:59:26 -0500 Subject: [PATCH] Categorical polars update --- .../profilers/categorical_column_profile.py | 24 ++++-- .../test_categorical_column_profile.py | 75 ++++++++++--------- 2 files changed, 54 insertions(+), 45 deletions(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 1ca63090..23e05721 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -7,6 +7,7 @@ from typing import cast import datasketches +import polars as pl from pandas import DataFrame, Series from .. import dp_logging @@ -474,7 +475,7 @@ def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float): return True return False - def _update_stop_condition(self, data: DataFrame): + def _update_stop_condition(self, data: DataFrame | pl.DataFrame): """Return value stop_condition_is_met given stop conditions. :param data: Dataframe currently being processed by categorical profiler @@ -497,8 +498,8 @@ def _get_categories_cms(self, df_series, len_df): """Return count min sketch and heavy hitters for both the batch and stream case. :param df_series: Series currently being processed by categorical profiler - :type df_series: Series - :param len_df: the total number of samples iin df_series + :type df_series: polars.Series + :param len_df: the total number of samples in df_series :type len_df: int :return: cms, heavy_hitter_dict, missing_heavy_hitter_dict """ @@ -601,13 +602,13 @@ def _get_categories_full(self, df_series) -> dict: :return: dict of counts for each unique value :rtype: dict """ - category_count: dict = df_series.value_counts(dropna=False).to_dict() + category_count: dict = Series(df_series).value_counts(dropna=False).to_dict() return category_count @BaseColumnProfiler._timeit(name="categories") def _update_categories( self, - df_series: DataFrame, + df_series: DataFrame | pl.DataFrame, prev_dependent_properties: dict = None, subset_properties: dict = None, ) -> None: @@ -657,7 +658,9 @@ def _update_categories( if self._stop_condition_is_met: self._categories = {} - def _update_helper(self, df_series_clean: Series, profile: dict) -> None: + def _update_helper( + self, df_series_clean: Series | pl.Series, profile: dict + ) -> None: """ Update col profile properties with clean dataset and its known profile. @@ -669,7 +672,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None: """ self._update_column_base_properties(profile) - def update(self, df_series: Series) -> CategoricalColumn: + def update(self, df_series: pl.Series | Series) -> CategoricalColumn: """ Update the column profile. 
@@ -682,12 +685,17 @@ def update(self, df_series: Series) -> CategoricalColumn: if len(df_series) == 0 or self._stop_condition_is_met: return self + if isinstance(df_series, pl.Series): + pandas_df = df_series.to_pandas() + else: + pandas_df = df_series + profile = dict(sample_size=len(df_series)) CategoricalColumn._update_categories(self, df_series) BaseColumnProfiler._perform_property_calcs( self, self.__calculations, - df_series=df_series, + df_series=pandas_df, prev_dependent_properties={}, subset_properties=profile, ) diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 55d2ea68..5a403dec 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import polars as pl from dataprofiler.profilers import CategoricalColumn from dataprofiler.profilers.json_decoder import load_column_profile @@ -51,7 +52,7 @@ def test_correct_categorical_model_string(self): self.assertCountEqual(categories, profile.categories) def test_stop_condition_is_met_initially(self): - dataset = pd.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10) + dataset = pl.Series(["a"] * 10 + ["b"] * 10 + ["c"] * 10 + ["d"] * 10) profile = CategoricalColumn("test dataset") profile.max_sample_size_to_check_stop_condition = 0 profile.stop_condition_unique_value_ratio = 0 @@ -368,7 +369,7 @@ def test_categorical_mapping(self): self.assertNotEqual(num_nan_count, len(column_profile.null_types_index["NaN"])) def test_true_categorical_report(self): - df_categorical = pd.Series( + df_categorical = pl.Series( [ "a", "a", @@ -415,7 +416,7 @@ def test_true_categorical_report(self): self.assertEqual(report, expected_profile) def test_false_categorical_report(self): - df_non_categorical = pd.Series(list(map(str, range(0, 20)))) + df_non_categorical = pl.Series(list(map(str, range(0, 20)))) profile = CategoricalColumn(df_non_categorical.name) profile.update(df_non_categorical) @@ -433,7 +434,7 @@ def test_false_categorical_report(self): self.assertEqual(report, expected_profile) def test_report(self): - df_non_categorical = pd.Series(list(map(str, range(0, 20)))) + df_non_categorical = pl.Series(list(map(str, range(0, 20)))) profile = CategoricalColumn(df_non_categorical.name) profile.update(df_non_categorical) @@ -681,32 +682,32 @@ def test_categorical_merge(self): def test_gini_impurity(self): # Normal test - df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"]) + df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) expected_val = ((4 / 7) * (3 / 7)) + ((4 / 7) * (3 / 7)) self.assertAlmostEqual(profile.gini_impurity, expected_val) # One class only test - df_categorical = pd.Series(["y", "y", "y", "y", "y", "y", "y"]) + df_categorical = pl.Series(["y", "y", "y", "y", "y", "y", "y"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) expected_val = 0 self.assertEqual(profile.gini_impurity, expected_val) # Empty test - df_categorical = pd.Series([]) + df_categorical = pl.Series([]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) self.assertEqual(profile.gini_impurity, None) def test_categorical_diff(self): # test psi new category in another profile - df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"]) + df_categorical = 
pl.Series(["y", "y", "y", "y", "n", "n", "n"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) - df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"]) + df_categorical = pl.Series(["y", "maybe", "y", "y", "n", "n", "maybe"]) profile2 = CategoricalColumn(df_categorical.name) profile2.update(df_categorical) @@ -734,7 +735,7 @@ def test_categorical_diff(self): self.assertDictEqual(expected_diff, actual_diff) # Test with one categorical column matching - df_not_categorical = pd.Series( + df_not_categorical = pl.Series( [ "THIS", "is", @@ -759,11 +760,11 @@ def test_categorical_diff(self): self.assertDictEqual(expected_diff, profile.diff(profile2)) # Test diff with psi enabled - df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"]) + df_categorical = pl.Series(["y", "y", "y", "y", "n", "n", "n", "maybe"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) - df_categorical = pd.Series(["y", "maybe", "y", "y", "n", "n", "maybe"]) + df_categorical = pl.Series(["y", "maybe", "y", "y", "n", "n", "maybe"]) profile2 = CategoricalColumn(df_categorical.name) profile2.update(df_categorical) @@ -787,32 +788,32 @@ def test_categorical_diff(self): self.assertDictEqual(expected_diff, profile.diff(profile2)) def test_unalikeability(self): - df_categorical = pd.Series(["a", "a"]) + df_categorical = pl.Series(["a", "a"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) self.assertEqual(profile.unalikeability, 0) - df_categorical = pd.Series(["a", "c", "b"]) + df_categorical = pl.Series(["a", "c", "b"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) self.assertEqual(profile.unalikeability, 1) - df_categorical = pd.Series(["a", "a", "a", "b", "b", "b"]) + df_categorical = pl.Series(["a", "a", "a", "b", "b", "b"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) self.assertEqual(profile.unalikeability, 18 / 30) - df_categorical = pd.Series(["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"]) + df_categorical = pl.Series(["a", "a", "b", "b", "b", "a", "c", "c", "a", "a"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) self.assertEqual(profile.unalikeability, 2 * (10 + 15 + 6) / 90) - df_categorical = pd.Series(["a"]) + df_categorical = pl.Series(["a"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) self.assertEqual(0, profile.unalikeability) - df_categorical = pd.Series([]) + df_categorical = pl.Series([]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) self.assertEqual(None, profile.unalikeability) @@ -820,7 +821,7 @@ def test_unalikeability(self): def test_top_k_categories_change(self): # Test if top_k_categories is None options = CategoricalOptions() - df_series = pd.Series(["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"]) + df_series = pl.Series(["a", "a", "b", "c", "d", "e", "e", "e", "f", "g"]) profile = CategoricalColumn(df_series.name, options) profile.update(df_series) self.assertEqual(len(profile.profile["statistics"]["categorical_count"]), 7) @@ -831,7 +832,7 @@ def test_top_k_categories_change(self): # Test if top_k_categories is greater than the count of categories options.top_k_categories = 6 - df_series = pd.Series(["a", "a", "b", "c", "d"]) + df_series = pl.Series(["a", "a", "b", "c", "d"]) profile = CategoricalColumn(df_series.name, options) profile.update(df_series) 
self.assertEqual(len(profile.profile["statistics"]["categorical_count"]), 4) @@ -947,7 +948,7 @@ def test_json_decode_after_update(self): # Actual deserialization # Build expected CategoricalColumn - df_categorical = pd.Series( + df_categorical = pl.Series( [ "a", "a", @@ -973,7 +974,7 @@ def test_json_decode_after_update(self): test_utils.assert_profiles_equal(deserialized, expected_profile) - df_categorical = pd.Series( + df_categorical = pl.Series( [ "a", # add existing "d", # add new @@ -987,7 +988,7 @@ def test_json_decode_after_update(self): assert deserialized.categorical_counts == {"c": 5, "b": 4, "a": 4, "d": 1} def test_cms_max_num_heavy_hitters(self): - df_categorical = pd.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10) + df_categorical = pl.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10) options = CategoricalOptions() options.cms = True @@ -1002,8 +1003,8 @@ def test_cms_max_num_heavy_hitters(self): self.assertTrue(profile.sample_size >= 10) def test_cms_update_hybrid_batch_stream(self): - dataset = pd.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14) - dataset1 = pd.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1) + dataset = pl.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14) + dataset1 = pl.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1) options = CategoricalOptions() options.cms = True @@ -1031,8 +1032,8 @@ def test_cms_update_hybrid_batch_stream(self): def test_cms_profile_merge_via_add(self): - dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9) - dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14) + dataset = pl.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9) + dataset1 = pl.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14) expected_categories = ["b", "c"] expected_categories_dict = {"b": 22, "c": 23} @@ -1074,8 +1075,8 @@ def test_cms_profile_merge_via_add(self): def test_cms_profile_min_max_num_heavy_hitters(self): - dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9) - dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14) + dataset = pl.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9) + dataset1 = pl.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14) options = CategoricalOptions() options.cms = True @@ -1097,8 +1098,8 @@ def test_cms_profile_min_max_num_heavy_hitters(self): def test_cms_catch_overwriting_with_missing_dict(self): - dataset = pd.Series(["b"] * 2 + ["c"] * 14) - dataset1 = pd.Series(["b"] * 5 + ["c"] * 10) + dataset = pl.Series(["b"] * 2 + ["c"] * 14) + dataset1 = pl.Series(["b"] * 5 + ["c"] * 10) options = CategoricalOptions() options.cms = True @@ -1126,7 +1127,7 @@ def test_cms_catch_overwriting_with_missing_dict(self): def test_cms_vs_full_mismatch_merge(self): - dataset = pd.Series(["b"] * 2 + ["c"] * 14) + dataset = pl.Series(["b"] * 2 + ["c"] * 14) options = CategoricalOptions() options.cms = True @@ -1176,7 +1177,7 @@ def test_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(self): ] len_unique = len(set(cat_sentence_list)) - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category" @@ -1200,7 +1201,7 @@ def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self): ) cat_sentence_list = list_unique_values * num_sentences - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category" @@ -1226,7 
+1227,7 @@ def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self): cat_sentence_list = list_unique_values * num_sentences len_unique = len(set(cat_sentence_list)) - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category" @@ -1255,7 +1256,7 @@ def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self): cat_sentence_list[-3] = self.test_sentence_upper3 + str(num_sentences - 2) len_unique = len(set(cat_sentence_list)) - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category" @@ -1279,7 +1280,7 @@ def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORI ] len_unique = len(set(cat_sentence_list)) - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category"
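
For reviewers, below is a minimal, self-contained sketch (not part of this patch) of the conversion pattern the change introduces in update() and _get_categories_full(): a polars Series is converted to pandas via to_pandas() before the existing pandas-based calculations run, and value_counts(dropna=False) supplies the per-category counts. The helper name categories_from_series is illustrative only and is not a dataprofiler API.

# Illustrative sketch only. categories_from_series is a hypothetical helper
# that mirrors the polars handling added by this patch; it is not part of
# dataprofiler. Requires pandas and polars (to_pandas also needs pyarrow).
from __future__ import annotations

import pandas as pd
import polars as pl


def categories_from_series(series: pl.Series | pd.Series) -> dict:
    """Return {category: count}, accepting either a polars or pandas Series."""
    if isinstance(series, pl.Series):
        # Mirrors CategoricalColumn.update(): convert to pandas so the
        # existing pandas-based property calculations keep working.
        series = series.to_pandas()
    # Mirrors _get_categories_full(): count every value, nulls included.
    return pd.Series(series).value_counts(dropna=False).to_dict()


if __name__ == "__main__":
    data = ["a", "a", "b", None]
    print(categories_from_series(pl.Series(data)))  # polars input
    print(categories_from_series(pd.Series(data)))  # pandas input

Converting at the update() boundary keeps the per-calculation code untouched; if the goal were a fuller polars port, the conversion could later be pushed down into the individual calculations instead.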