Skip to content

Commit

Permalink
partial update to numerical_column_stats
Browse files Browse the repository at this point in the history
  • Loading branch information
atl1502 committed Jan 29, 2024
1 parent 1b3220b commit 66520cc
Showing 1 changed file with 30 additions and 8 deletions.
38 changes: 30 additions & 8 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import numpy as np
import numpy.typing as npt
import pandas as pd
import polars as pl
import scipy.stats

from . import float_column_profile, histogram_utils, profiler_utils
Expand Down Expand Up @@ -1125,10 +1126,12 @@ def _estimate_stats_from_histogram(self) -> np.float64:
def _total_histogram_bin_variance(
self, input_array: np.ndarray | pd.Series
) -> float:
if type(input_array) is pd.Series:
input_array = pl.from_pandas(input_array)
input_array = input_array.to_numpy()
# calculate total variance over all bins of a histogram
bin_counts = self._stored_histogram["histogram"]["bin_counts"]
bin_edges = self._stored_histogram["histogram"]["bin_edges"]

        # account for digitize, which is exclusive
bin_edges = bin_edges.copy()
bin_edges[-1] += 1e-3
Expand All @@ -1151,6 +1154,9 @@ def _histogram_bin_error(self, input_array: np.ndarray | pd.Series) -> np.float6
:return: binning error
:rtype: float
"""
if type(input_array) is pd.Series:
input_array = pl.from_pandas(input_array)
input_array = input_array.to_numpy()
bin_edges = self._stored_histogram["histogram"]["bin_edges"]

        # account for digitize, which is exclusive
Expand Down Expand Up @@ -1265,7 +1271,7 @@ def _histogram_to_array(self) -> np.ndarray:
return array_flatten

def _get_histogram(
self, values: np.ndarray | pd.Series
self, values: np.ndarray | pl.Series
) -> tuple[np.ndarray, np.ndarray]:
"""
Calculate stored histogram the suggested bin counts for each histogram method.
Expand All @@ -1281,7 +1287,7 @@ def _get_histogram(
if isinstance(values, (np.ndarray, list)):
unique_value = values[0]
else:
unique_value = values.iloc[0]
unique_value = values[0]
bin_edges = np.array([unique_value, unique_value])
for bin_method in self.histogram_bin_method_names:
self.histogram_methods[bin_method]["histogram"][
Expand Down Expand Up @@ -1322,6 +1328,9 @@ def _get_histogram(
def _merge_histogram(self, values: np.ndarray | pd.Series) -> None:
# values is the current array of values,
# that needs to be updated to the accumulated histogram
if type(values) is pd.Series:
values = pl.from_pandas(values)
values = values.to_numpy()
combined_values = np.concatenate([values, self._histogram_to_array()])
bin_counts, bin_edges = self._get_histogram(combined_values)
self._stored_histogram["histogram"]["bin_counts"] = bin_counts
Expand All @@ -1348,12 +1357,13 @@ def _update_histogram(self, df_series: pd.Series) -> None:
:type df_series: pandas.core.series.Series
:return:
"""
df_series = df_series.replace([np.inf, -np.inf], np.nan).dropna()
if df_series.empty:
df_series = pl.from_pandas(df_series, nan_to_null=True).cast(pl.Float64)
df_series = df_series.replace([np.inf, -np.inf], [None]).drop_nulls()
if df_series.is_empty():
return

if self._has_histogram:
self._merge_histogram(df_series.tolist())
self._merge_histogram(df_series.to_list())
else:
bin_counts, bin_edges = self._get_histogram(df_series)
self._stored_histogram["histogram"]["bin_counts"] = bin_counts
Expand Down Expand Up @@ -1741,7 +1751,12 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
:type profile: dict
:return: None
"""
if df_series_clean.empty:
df_series_clean = pl.from_pandas(df_series_clean)
if df_series_clean.dtype == pl.String:
df_series_clean = df_series_clean.str.strip_chars().cast(pl.Float64)
else:
df_series_clean = df_series_clean.cast(pl.Float64)
if df_series_clean.is_empty():
return

prev_dependent_properties = {
Expand All @@ -1751,6 +1766,7 @@ def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
"biased_kurtosis": self._biased_kurtosis,
}
subset_properties = copy.deepcopy(profile)
df_series_clean = df_series_clean.to_pandas()
df_series_clean = df_series_clean.astype(float)
super()._perform_property_calcs( # type: ignore
self.__calculations,
Expand All @@ -1769,7 +1785,10 @@ def _get_min(
prev_dependent_properties: dict,
subset_properties: dict,
) -> None:
df_series = pl.from_pandas(df_series)
min_value = df_series.min()
if self.min is not None:
min_value = type(self.min)(min_value)
self.min = min_value if not self.min else min(self.min, min_value)
subset_properties["min"] = min_value

Expand All @@ -1780,7 +1799,10 @@ def _get_max(
prev_dependent_properties: dict,
subset_properties: dict,
) -> None:
df_series = pl.from_pandas(df_series)
max_value = df_series.max()
if self.max is not None:
max_value = type(self.max)(max_value)
self.max = max_value if not self.max else max(self.max, max_value)
subset_properties["max"] = max_value

Expand All @@ -1793,7 +1815,7 @@ def _get_sum(
) -> None:
if np.isinf(self.sum) or (np.isnan(self.sum) and self.match_count > 0):
return

# df_series = pl.from_pandas(df_series)
sum_value = df_series.sum()
if np.isinf(sum_value) or (len(df_series) > 0 and np.isnan(sum_value)):
warnings.warn(
Expand Down

0 comments on commit 66520cc

Please sign in to comment.