From ad64b75a559b0538b4c7fe790da82898bb20b55d Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 17 Feb 2024 16:42:42 -0500 Subject: [PATCH 1/3] REGR: Index.map adding back tz to tz-agnostic result --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 18 ++++++++++-------- .../indexes/datetimes/methods/test_map.py | 13 +++++++++++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ca4fef4f57fb6..ec61cc2987afd 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -36,6 +36,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) +- Fixed regression in :meth:`Index.map` that would not change the dtype when the provided mapping would change data from tz-aware to tz-agnostic or tz-agnostic to tz-aware (:issue:`57192`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c17e01b85fa84..b56c4fc1d36bc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6124,14 +6124,16 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): # empty dtype = self.dtype - # e.g. 
if we are floating and new_values is all ints, then we - # don't want to cast back to floating. But if we are UInt64 - # and new_values is all ints, we want to try. - same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type - if same_dtype: - new_values = maybe_cast_pointwise_result( - new_values, self.dtype, same_dtype=same_dtype - ) + if self.inferred_type != "datetime64": + # e.g. if we are floating and new_values is all ints, then we + # don't want to cast back to floating. But if we are UInt64 + # and new_values is all ints, we want to try. + # GH#57192 - we skip datetime64 because inference from values is reliable + same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type + if same_dtype: + new_values = maybe_cast_pointwise_result( + new_values, self.dtype, same_dtype=same_dtype + ) return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) diff --git a/pandas/tests/indexes/datetimes/methods/test_map.py b/pandas/tests/indexes/datetimes/methods/test_map.py index f35f07bd32068..b012c0b79b68e 100644 --- a/pandas/tests/indexes/datetimes/methods/test_map.py +++ b/pandas/tests/indexes/datetimes/methods/test_map.py @@ -45,3 +45,16 @@ def test_index_map(self, name): ) exp_index = MultiIndex.from_product(((2018,), range(1, 7)), names=[name, name]) tm.assert_index_equal(index, exp_index) + + @pytest.mark.parametrize("input_tz", ["UTC", None]) + @pytest.mark.parametrize("output_tz", ["UTC", None]) + def test_mapping_tz_to_tz_agnostic(self, input_tz, output_tz): + # GH#57192 + index = date_range("2018-01-01", periods=6, freq="ME", tz=input_tz) + expected = date_range("2018-01-01", periods=6, freq="ME", tz=output_tz) + if input_tz == "UTC" and output_tz == "UTC": + method = "tz_convert" + else: + method = "tz_localize" + result = index.map(lambda x: getattr(x, method)(output_tz)) + tm.assert_index_equal(result, expected) From ac14e3c1ae0735d47128cc583cdda591c76e7958 Mon Sep 17 00:00:00 2001 From: richard Date: 
Sun, 31 Mar 2024 13:38:39 -0400 Subject: [PATCH 2/3] Rework --- doc/source/whatsnew/v2.2.2.rst | 1 + pandas/_libs/lib.pyi | 1 + pandas/_libs/lib.pyx | 24 ++++++++++++++++++++++++ pandas/core/arrays/datetimes.py | 12 +++++++++--- pandas/core/indexes/base.py | 18 ++++++++---------- 5 files changed, 43 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 0dac3660c76b2..9c62d2be0f1fa 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -16,6 +16,7 @@ Fixed regressions - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pandas nullable on with missing values (:issue:`56702`) - :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pyarrow nullable on with missing values (:issue:`57664`) - Avoid issuing a spurious ``DeprecationWarning`` when a custom :class:`DataFrame` or :class:`Series` subclass method is called (:issue:`57553`) +- Fixed regression in :meth:`Index.map` that would not change the dtype when the provided mapping would change data from tz-aware to tz-agnostic or tz-agnostic to tz-aware (:issue:`57192`) - Fixed regression in precision of :func:`to_datetime` with string and ``unit`` input (:issue:`57051`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b39d32d069619..2b6fd5288450a 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -54,6 +54,7 @@ def is_timedelta_or_timedelta64_array( values: np.ndarray, skipna: bool = True ) -> bool: ... def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... +def is_datetime_naive_array(values: np.ndarray) -> bool: ... def is_time_array(values: np.ndarray, skipna: bool = ...): ... def is_date_array(values: np.ndarray, skipna: bool = ...): ... def is_datetime_array(values: np.ndarray, skipna: bool = ...): ... 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a2205454a5a46..e56efb83daf70 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2068,6 +2068,30 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: return True +def is_datetime_naive_array(values: ndarray) -> bool: + """ + Check that all values are timezone-naive (have no tzinfo). + Does not check that the values are datetime-like types. + """ + cdef: + Py_ssize_t j, n = len(values) + object tz + + if n == 0: + return False + + for j in range(n): + # Compare val's timezone with the reference timezone + # NaT can coexist with tz-aware datetimes, so skip if encountered + # val = values[j] + # if val is not NaT and val is not None and not util.is_nan(val): + tz = getattr(values[j], "tzinfo", None) + if tz is not None: + return False + + return True + + @cython.internal cdef class TimedeltaValidator(TemporalValidator): cdef bint is_value_typed(self, object value) except -1: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d446407ec3d01..313eb2046a84f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -290,9 +290,15 @@ def _scalar_type(self) -> type[Timestamp]: @classmethod def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: - if lib.infer_dtype(scalars, skipna=True) not in ["datetime", "datetime64"]: - # TODO: require any NAs be valid-for-DTA - # TODO: if dtype is passed, check for tzawareness compat? 
+ if not lib.is_datetime64_array(scalars): + raise ValueError + elif isinstance( + dtype, DatetimeTZDtype + ) and not lib.is_datetime_with_singletz_array(scalars): + raise ValueError + elif isinstance(dtype, np.dtype) and not lib.is_datetime_naive_array(scalars): raise ValueError return cls._from_sequence(scalars, dtype=dtype) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4398fff70c5b3..30cf6f0b866ee 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6103,16 +6103,14 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): # empty dtype = self.dtype - if self.inferred_type != "datetime64": - # e.g. if we are floating and new_values is all ints, then we - # don't want to cast back to floating. But if we are UInt64 - # and new_values is all ints, we want to try. - # GH#57192 - we skip datetime64 because inference from values is reliable - same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type - if same_dtype: - new_values = maybe_cast_pointwise_result( - new_values, self.dtype, same_dtype=same_dtype - ) + # e.g. if we are floating and new_values is all ints, then we + # don't want to cast back to floating. But if we are UInt64 + # and new_values is all ints, we want to try. 
+ same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type + if same_dtype: + new_values = maybe_cast_pointwise_result( + new_values, self.dtype, same_dtype=same_dtype + ) return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name) From effcf58a1d6ffe5a5631fb77834c394a947e58a7 Mon Sep 17 00:00:00 2001 From: richard Date: Sun, 31 Mar 2024 13:39:54 -0400 Subject: [PATCH 3/3] cleanup --- pandas/_libs/lib.pyx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e56efb83daf70..407b26b790128 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2081,10 +2081,6 @@ def is_datetime_naive_array(values: ndarray) -> bool: return False for j in range(n): - # Compare val's timezone with the reference timezone - # NaT can coexist with tz-aware datetimes, so skip if encountered - # val = values[j] - # if val is not NaT and val is not None and not util.is_nan(val): tz = getattr(values[j], "tzinfo", None) if tz is not None: return False