diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 59578b682235c..1b0f74ad088cd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -272,11 +272,11 @@ Performance improvements - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) -- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) +- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) - Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`) -- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`) -- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`) -- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`) +- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`) +- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) +- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 105c08af1406b..ca7f647c755d7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4235,7 +4235,6 @@ def join( return self._join_via_get_indexer(other, how, sort) - @final def _join_empty( self, other: Index, how: JoinHow, sort: bool ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 24f53f16e1985..63fcddd961e04 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -472,18 +472,31 @@ def _shallow_copy(self, values, name: Hashable = no_default): if values.dtype.kind == "f": return Index(values, name=name, dtype=np.float64) - if values.dtype.kind == "i" and values.ndim == 1 and len(values) > 1: + if values.dtype.kind == "i" and values.ndim == 1: # GH 46675 & 43885: If values is equally spaced, return a # more memory-compact RangeIndex instead of Index with 64-bit dtype + if len(values) == 0: + return type(self)._simple_new(_empty_range, name=name) + elif len(values) == 1: + start = values[0] + new_range = range(start, start + self.step, self.step) + return type(self)._simple_new(new_range, name=name) diff = values[1] - values[0] if not missing.isna(diff) and diff != 0: - maybe_range_indexer, remainder = np.divmod(values - values[0], diff) - if ( - lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer)) - and not remainder.any() - ): + if len(values) == 2: + # Can skip is_range_indexer check new_range = range(values[0], values[-1] + diff, diff) return type(self)._simple_new(new_range, name=name) + else: + maybe_range_indexer, remainder = np.divmod(values - values[0], diff) + if ( + lib.is_range_indexer( + maybe_range_indexer, len(maybe_range_indexer) + ) + and not remainder.any() + ): + new_range = range(values[0], values[-1] + diff, diff) + return type(self)._simple_new(new_range, name=name) return self._constructor._simple_new(values, name=name) def _view(self) -> Self: @@ -894,12 +907,19 @@ def symmetric_difference( result = result.rename(result_name) return result + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + if other.dtype.kind == "i": + other = self._shallow_copy(other._values, name=other.name) + return super()._join_empty(other, how=how, sort=sort) + def _join_monotonic( self, other: Index, how: JoinHow = "left" ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: # This currently only gets called for the monotonic increasing case if not isinstance(other, type(self)): - maybe_ri = self._shallow_copy(other._values) + maybe_ri = self._shallow_copy(other._values, name=other.name) if not isinstance(maybe_ri, type(self)): return super()._join_monotonic(other, how=how) other = maybe_ri @@ -1075,6 +1095,8 @@ def __getitem__(self, key): """ Conserve RangeIndex type for scalar and slice keys. """ + if key is Ellipsis: + key = slice(None) if isinstance(key, slice): return self._getitem_slice(key) elif is_integer(key): @@ -1094,17 +1116,20 @@ def __getitem__(self, key): ) elif com.is_bool_indexer(key): if isinstance(getattr(key, "dtype", None), ExtensionDtype): - np_key = key.to_numpy(dtype=bool, na_value=False) + key = key.to_numpy(dtype=bool, na_value=False) else: - np_key = np.asarray(key, dtype=bool) - check_array_indexer(self._range, np_key) # type: ignore[arg-type] + key = np.asarray(key, dtype=bool) + check_array_indexer(self._range, key) # type: ignore[arg-type] # Short circuit potential _shallow_copy check - if np_key.all(): + if key.all(): return self._simple_new(self._range, name=self.name) - elif not np_key.any(): + elif not key.any(): return self._simple_new(_empty_range, name=self.name) - return self.take(np.flatnonzero(np_key)) - return super().__getitem__(key) + key = np.flatnonzero(key) + try: + return self.take(key) + except (TypeError, ValueError): + return super().__getitem__(key) def _getitem_slice(self, slobj: slice) -> Self: """ diff --git a/pandas/tests/indexes/ranges/test_join.py b/pandas/tests/indexes/ranges/test_join.py index ca3af607c0a38..09db30b1d4c51 100644 --- a/pandas/tests/indexes/ranges/test_join.py +++ b/pandas/tests/indexes/ranges/test_join.py @@ -207,9 +207,15 @@ def test_join_self(self, join_type): [-1, -1, 0, 1], "outer", ], + [RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "left"], + [RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "right"], + [RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "inner"], + [RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "outer"], ], ) -@pytest.mark.parametrize("right_type", [RangeIndex, lambda x: Index(list(x))]) +@pytest.mark.parametrize( + "right_type", [RangeIndex, lambda x: Index(list(x), dtype=x.dtype)] +) def test_join_preserves_rangeindex( left, right, expected, expected_lidx, expected_ridx, how, right_type ): diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 8c24ce5d699d5..3040b4c13dc17 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -608,6 +608,26 @@ def test_range_index_rsub_by_const(self): tm.assert_index_equal(result, expected) +def test_reindex_1_value_returns_rangeindex(): + ri = RangeIndex(0, 10, 2, name="foo") + result, result_indexer = ri.reindex([2]) + expected = RangeIndex(2, 4, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([1], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + +def test_reindex_empty_returns_rangeindex(): + ri = RangeIndex(0, 10, 2, name="foo") + result, result_indexer = ri.reindex([]) + expected = RangeIndex(0, 0, 2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + expected_indexer = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + def test_append_non_rangeindex_return_rangeindex(): ri = RangeIndex(1) result = ri.append(Index([1])) @@ -653,6 +673,21 @@ def test_take_return_rangeindex(): tm.assert_index_equal(result, expected, exact=True) +@pytest.mark.parametrize( + "rng, exp_rng", + [ + [range(5), range(3, 4)], + [range(0, -10, -2), range(-6, -8, -2)], + [range(0, 10, 2), range(6, 8, 2)], + ], +) +def test_take_1_value_returns_rangeindex(rng, exp_rng): + ri = RangeIndex(rng, name="foo") + result = ri.take([3]) + expected = RangeIndex(exp_rng, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + def test_append_one_nonempty_preserve_step(): expected = RangeIndex(0, -1, -1) result = RangeIndex(0).append([expected]) @@ -695,3 +730,25 @@ def test_getitem_boolmask_wrong_length(): ri = RangeIndex(4, name="foo") with pytest.raises(IndexError, match="Boolean index has wrong length"): ri[[True]] + + +def test_getitem_integers_return_rangeindex(): + result = RangeIndex(0, 10, 2, name="foo")[[0, -1]] + expected = RangeIndex(start=0, stop=16, step=8, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + result = RangeIndex(0, 10, 2, name="foo")[[3]] + expected = RangeIndex(start=6, stop=8, step=2, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_empty_return_rangeindex(): + result = RangeIndex(0, 10, 2, name="foo")[[]] + expected = RangeIndex(start=0, stop=0, step=1, name="foo") + tm.assert_index_equal(result, expected, exact=True) + + +def test_getitem_integers_return_index(): + result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]] + expected = Index([0, 2, 8], dtype="int64", name="foo") + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7112b866018a2..c01a8647dd07d 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -509,7 +509,7 @@ def test_loc_getitem_list_with_fail(self): s.loc[[2]] - msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]" + msg = "None of [RangeIndex(start=3, stop=4, step=1)] are in the [index]" with pytest.raises(KeyError, match=re.escape(msg)): s.loc[[3]] diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index cc61d8bca7de3..b722a7f179479 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -968,6 +968,8 @@ def test_append_to_multiple_min_itemsize(setup_path): } ) expected = df.iloc[[0]] + # Reading/writing RangeIndex info is not supported yet + expected.index = Index(list(range(len(expected.index)))) with ensure_clean_store(setup_path) as store: store.append_to_multiple(