From 1c5195c04bd20420bb8e6a50c879ed5f06b4eea2 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 15 Oct 2024 12:57:40 +0200 Subject: [PATCH 1/4] BUG: Fix copy semantics in ``__array__`` This fixes the semantics of ``__array__``. While rejecting ``copy=False`` is pretty harmless, ``copy=True`` should never have been ignored and is dangerous. --- pandas/core/arrays/arrow/array.py | 10 +++++++++- pandas/core/arrays/categorical.py | 14 +++++++++++--- pandas/core/arrays/datetimelike.py | 7 +++++++ pandas/core/arrays/interval.py | 5 +++++ pandas/core/arrays/masked.py | 9 ++++++++- pandas/core/arrays/numpy_.py | 3 +++ pandas/core/arrays/period.py | 5 +++++ pandas/core/arrays/sparse/array.py | 5 +++++ pandas/core/generic.py | 13 +++++++++++-- pandas/core/indexes/base.py | 6 +++++- pandas/core/indexes/multi.py | 3 +++ pandas/core/internals/construction.py | 2 +- pandas/core/series.py | 13 ++++++++++--- pandas/tests/extension/json/array.py | 10 +++++++++- 14 files changed, 92 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 619e7b3ccfb4f..940d08216a963 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -667,7 +667,15 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + if copy is False: + # TODO: By using `zero_copy_only` it may be possible to implement this + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + elif copy is None: + copy = False # The NumPy copy=False meaning is different here. + + return self.to_numpy(dtype=dtype, copy=copy) def __invert__(self) -> Self: # This is a bit wise op for integer types diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7cde4c53cb2f5..21b32b1c6fa89 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1663,7 +1663,7 @@ def __array__( Specifies the the dtype for the array. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. Returns ------- @@ -1686,13 +1686,21 @@ def __array__( >>> np.asarray(cat) array(['a', 'b'], dtype=object) """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + # TODO: using asarray_func because NumPy 1.x doesn't support copy=None + asarray_func = np.asarray if copy is None else np.array + ret = take_nd(self.categories._values, self._codes) if dtype and np.dtype(dtype) != self.categories.dtype: - return np.asarray(ret, dtype) + return asarray_func(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return np.asarray(ret) + return asarray_func(ret) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a25a698856747..9c821bf0d184e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -359,7 +359,14 @@ def __array__( ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) return np.array(list(self), dtype=object) + + if copy is True: + return np.array(self._ndarray, dtype=dtype) return self._ndarray @overload diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2ac9c77bef322..f6b815a874c10 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1606,6 +1606,11 @@ def __array__( Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + left = self._left right = self._right mask = self.isna() diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 92ed690e527c7..1be7787d67c93 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -581,7 +581,14 @@ def __array__( the array interface, return my values We return an object array here to preserve our scalar values """ - return self.to_numpy(dtype=dtype) + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if copy is None: + copy = False # The NumPy copy=False meaning is different here. + return self.to_numpy(dtype=dtype, copy=copy) _HANDLED_TYPES: tuple[type, ...] diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index aafcd82114b97..e8b33937ca866 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is not None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 7d0ad74f851f0..b2a56b5ca2df3 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -389,6 +389,11 @@ def freqstr(self) -> str: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if dtype == "i8": return self.asi8 elif dtype == bool: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 0c76280e7fdb4..c1b3605c4e5b5 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -547,6 +547,11 @@ def from_spmatrix(cls, data: spmatrix) -> Self: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + fill_value = self.fill_value if self.sp_index.ngaps == 0: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 42516f0a85e07..02fbb08a14804 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2015,8 +2015,17 @@ def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool | None = None ) -> np.ndarray: values = self._values - arr = np.asarray(values, dtype=dtype) - if astype_is_view(values.dtype, arr.dtype) and self._mgr.is_single_block: + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.asarray(values, dtype=dtype, copy=copy) + + if ( + copy is not False + and astype_is_view(values.dtype, arr.dtype) + and self._mgr.is_single_block + ): # Check if both conversions can be done without a copy if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view( values.dtype, arr.dtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 749a5fea4d513..5fdf073b9e8c9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -908,7 +908,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. """ - return np.asarray(self._data, dtype=dtype) + if copy is None: + # Note, that the if branch exists for NumPy 1.x support + return np.asarray(self._data, dtype=dtype) + + return np.asarray(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ae9b272af9fe9..55f96ce777d09 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1391,6 +1391,9 @@ def copy( # type: ignore[override] def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" + if copy is True: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.values, dtype=dtype, copy=copy) return self.values def view(self, cls=None) -> Self: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 959e572b2b35b..0812ba5e6def4 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -258,7 +258,7 @@ def ndarray_to_mgr( # and a subsequent `astype` will not already result in a copy values = np.array(values, copy=True, order="F") else: - values = np.array(values, copy=False) + values = np.asarray(values) values = _ensure_2d(values) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index fe2bb0b5aa5c3..2270e0966a505 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -842,7 +842,7 @@ def __array__( the dtype is inferred from the data. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. Returns ------- @@ -879,8 +879,15 @@ def __array__( dtype='datetime64[ns]') """ values = self._values - arr = np.asarray(values, dtype=dtype) - if astype_is_view(values.dtype, arr.dtype): + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.asarray(values, dtype=dtype, copy=copy) + + if copy is True: + return arr + if copy is False or astype_is_view(values.dtype, arr.dtype): arr = arr.view() arr.flags.writeable = False return arr diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 4fa48023fbc95..a68c8a06e1d18 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -148,12 +148,20 @@ def __ne__(self, other): return NotImplemented def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if dtype is None: dtype = object if dtype == object: # on py38 builds it looks like numpy is inferring to a non-1D array return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) @property def nbytes(self) -> int: From 2183861e1addba9c883ed5e206db0450aa161d3b Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 15 Oct 2024 13:34:39 +0200 Subject: [PATCH 2/4] BUG: Fix one more path not translating ``copy=`` correctly --- pandas/core/arrays/categorical.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 21b32b1c6fa89..c29466b13f6ff 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -579,10 +579,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array( + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + asarray_func = np.array if copy else np.asarray + result = asarray_func( self, dtype=dtype, - copy=copy, ) else: From 404827b81381439626180fd9a690776d5eacc337 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 15 Oct 2024 14:16:38 +0200 Subject: [PATCH 3/4] BUG: Avoid asarray with copy= (it was added in 2.0) --- pandas/core/arrays/numpy_.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/indexes/base.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index e8b33937ca866..9f7238a97d808 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -152,7 +152,7 @@ def __array__( ) -> np.ndarray: if copy is not None: # Note: branch avoids `copy=None` for NumPy 1.x support - return np.asarray(self._ndarray, dtype=dtype, copy=copy) + return np.array(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 02fbb08a14804..0a43556496973 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2019,7 +2019,7 @@ def __array__( # Note: branch avoids `copy=None` for NumPy 1.x support arr = np.asarray(values, dtype=dtype) else: - arr = np.asarray(values, dtype=dtype, copy=copy) + arr = np.array(values, dtype=dtype, copy=copy) if ( copy is not False diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5fdf073b9e8c9..cf3d1e6a2ee2d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -912,7 +912,7 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: # Note, that the if branch exists for NumPy 1.x support return np.asarray(self._data, dtype=dtype) - return np.asarray(self._data, dtype=dtype, copy=copy) + return np.array(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): From ec08728216a172f518e230ccc637e44cf0a65582 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 15 Oct 2024 14:44:13 +0200 Subject: [PATCH 4/4] More fixes found by typing checks (or working around them) --- pandas/core/arrays/categorical.py | 20 +++++++++----------- pandas/core/indexes/multi.py | 2 +- pandas/core/series.py | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c29466b13f6ff..ac64c57bece9f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -581,11 +581,10 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: elif len(self.codes) == 0 or len(self.categories) == 0: # For NumPy 1.x compatibility we cannot use copy=None. And # `copy=False` has the meaning of `copy=None` here: - asarray_func = np.array if copy else np.asarray - result = asarray_func( - self, - dtype=dtype, - ) + if not copy: + result = np.asarray(self, dtype=dtype) + else: + result = np.array(self, dtype=dtype) else: # GH8628 (PERF): astype category codes instead of astyping array @@ -1693,16 +1692,15 @@ def __array__( "Unable to avoid copy while creating an array as requested." ) - # TODO: using asarray_func because NumPy 1.x doesn't support copy=None - asarray_func = np.asarray if copy is None else np.array - ret = take_nd(self.categories._values, self._codes) - if dtype and np.dtype(dtype) != self.categories.dtype: - return asarray_func(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return asarray_func(ret) + + if copy is None: + # Branch required since copy=None is not defined on 1.x + return np.asarray(ret, dtype=dtype) + return np.array(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 55f96ce777d09..f21ab28061e9d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1393,7 +1393,7 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" if copy is True: # Note: branch avoids `copy=None` for NumPy 1.x support - return np.asarray(self.values, dtype=dtype, copy=copy) + return np.array(self.values, dtype=dtype, copy=copy) return self.values def view(self, cls=None) -> Self: diff --git a/pandas/core/series.py b/pandas/core/series.py index 2270e0966a505..97193c7b564e9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -883,7 +883,7 @@ def __array__( # Note: branch avoids `copy=None` for NumPy 1.x support arr = np.asarray(values, dtype=dtype) else: - arr = np.asarray(values, dtype=dtype, copy=copy) + arr = np.array(values, dtype=dtype, copy=copy) if copy is True: return arr