From f573ec49bd6fbd7370732af943ff425ea0c2c223 Mon Sep 17 00:00:00 2001 From: Vamp1899 <67709006+Vamp1899@users.noreply.github.com> Date: Thu, 14 Jul 2022 12:45:53 +0530 Subject: [PATCH 1/2] Added data privacy as well as test file --- src/pandas_profiling/config.py | 64 +++++++++++++++++++++++++- src/pandas_profiling/profile_report.py | 34 ++++++++++++-- tests/issues/test_issue983.py | 8 ++++ 3 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 tests/issues/test_issue983.py diff --git a/src/pandas_profiling/config.py b/src/pandas_profiling/config.py index cb8034bd0..91667959c 100644 --- a/src/pandas_profiling/config.py +++ b/src/pandas_profiling/config.py @@ -1,9 +1,13 @@ """Configuration for the package.""" from enum import Enum from typing import Any, Dict, List, Optional - +import warnings from pydantic import BaseModel, BaseSettings, Field +# Comment this function to see Warnings in console +def warn(*args, **kwargs): + pass +warnings.warn = warn def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: """ @@ -185,6 +189,22 @@ class Html(BaseModel): full_width: bool = False +class JsonNonFiniteEncoding(Enum): + # Use the default python behaviour, which violates the official JSON standard, basically allow_nan = False + __default = 0 + # Encode non-finite numbers as null values, allow_nan = True + __num_null = 1 + # Encode non-finite floats as null values, allow_nan = True + __float_null = 2 + + def fetch_python(self): + return self.__default + + def fetch_null_values(self): + return self.__num_null + + def fetch_float_values(self): + return self.__float_null class Duplicates(BaseModel): head: int = 10 @@ -299,6 +319,11 @@ class Config: n_freq_table_max: int = 10 n_extreme_obs: int = 10 + #JSON for non finite values + + Jsnf_instance = JsonNonFiniteEncoding + json_non_finite_encoding: Jsnf_instance = Jsnf_instance._JsonNonFiniteEncoding__num_null.value + # Report rendering report: Report = Report() html: Html = Html() @@ -308,6 +333,43 @@ def update(self, updates: dict) -> "Settings": update = _merge_dictionaries(self.dict(), updates) return self.parse_obj(self.copy(update=update)) +class PandasSettings(Settings): + pass + +class SparkSettings(Settings): + # TO-DO write description + vars: Univariate = Univariate() + + vars.num.low_categorical_threshold = 0 + + infer_dtypes = False + + correlations: Dict[str, Correlation] = { + "spearman": Correlation(key="spearman"), + "pearson": Correlation(key="pearson"), + "kendall": Correlation(key="kendall"), + "cramers": Correlation(key="cramers"), + "phi_k": Correlation(key="phi_k"), + } + correlations["pearson"].calculate = True + correlations["spearman"].calculate = True + correlations["kendall"].calculate = False + correlations["cramers"].calculate = False + correlations["phi_k"].calculate = False + + interactions: Interactions = Interactions() + interactions.continuous = False + + missing_diagrams: Dict[str, bool] = { + "bar": False, + "matrix": False, + "dendrogram": False, + "heatmap": False, + } + + samples: Samples = Samples() + samples.tail = 0 + samples.random = 0 class Config: arg_groups: Dict[str, Any] = { diff --git a/src/pandas_profiling/profile_report.py b/src/pandas_profiling/profile_report.py index 3d838de71..06d7326cb 100644 --- a/src/pandas_profiling/profile_report.py +++ b/src/pandas_profiling/profile_report.py @@ -3,14 +3,19 @@ import warnings from pathlib import Path from typing import Any, Dict, Optional, Union - +import math import numpy as np import pandas as pd import yaml from tqdm.auto import tqdm from visions import VisionsTypeset - -from pandas_profiling.config import Config, Settings +from pandas_profiling.config import ( + Config, + PandasSettings, + Settings, + SparkSettings, + JsonNonFiniteEncoding, +) from pandas_profiling.expectations_report import ExpectationsReport from pandas_profiling.model.alerts import AlertType from pandas_profiling.model.describe import describe as describe_df @@ -331,6 +336,22 @@ def encode_it(o: Any) -> Any: else: if isinstance(o, (bool, int, float, str)): return o + elif isinstance(o, float): + if not math.isfinite(o): + # Special handling for non-finite floats. + # This is necessary because JSON does not support NaN/Infinity values. + # The default in Python is to generate invalid JSON. + # Depending on the configuration, we can encode them as null values, + # stringify the non-finite value, or output it as is to keep the default ,Python behaviour. + Jsnf_instance = JsonNonFiniteEncoding + if self.config.json_non_finite_encoding.value == Jsnf_instance._JsonNonFiniteEncoding__num_null.value: + return None + elif self.config.json_non_finite_encoding.value == Jsnf_instance._JsonNonFiniteEncoding__float_null.value: + return str(o) + else: + return o + else: + return o elif isinstance(o, list): return [encode_it(v) for v in o] elif isinstance(o, set): @@ -420,3 +441,10 @@ def _repr_html_(self) -> None: def __repr__(self) -> str: """Override so that Jupyter Notebook does not print the object.""" return "" + + def get_default_settings(self, df) -> Settings: + if isinstance(df, (pd.DataFrame, pd.Series)): + return PandasSettings() + else: + return SparkSettings() + diff --git a/tests/issues/test_issue983.py b/tests/issues/test_issue983.py new file mode 100644 index 000000000..e08ec364e --- /dev/null +++ b/tests/issues/test_issue983.py @@ -0,0 +1,8 @@ +import pandas as pd +from pandas_profiling import ProfileReport +import numpy as np +df = pd.DataFrame([1, 1, np.nan], columns=["a"]) + +profile = ProfileReport(df, title="Pandas Profiling Report", minimal=True) + +print(profile.to_json()) From 747b86d389911d03f80afaec7bb3e555ecde718b Mon Sep 17 00:00:00 2001 From: Nik21 <67709006+Vamp1899@users.noreply.github.com> Date: Thu, 14 Jul 2022 12:56:55 +0530 Subject: [PATCH 2/2] Update profile_report.py Removed float instance check from encode_it function --- src/pandas_profiling/profile_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pandas_profiling/profile_report.py b/src/pandas_profiling/profile_report.py index 06d7326cb..3edc292d9 100644 --- a/src/pandas_profiling/profile_report.py +++ b/src/pandas_profiling/profile_report.py @@ -334,7 +334,7 @@ def encode_it(o: Any) -> Any: if isinstance(o, dict): return {encode_it(k): encode_it(v) for k, v in o.items()} else: - if isinstance(o, (bool, int, float, str)): + if isinstance(o, (bool, int, str)): return o elif isinstance(o, float): if not math.isfinite(o):