Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: RangeIndex.take/reindex/join with single/no value returns RangeIndex #57752

Closed
wants to merge 9 commits into from
6 changes: 3 additions & 3 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -257,9 +257,9 @@ Performance improvements
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
- Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`)
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`)
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)

Expand Down
1 change: 0 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4241,7 +4241,6 @@ def join(

return self._join_via_get_indexer(other, how, sort)

@final
def _join_empty(
self, other: Index, how: JoinHow, sort: bool
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
Expand Down
34 changes: 27 additions & 7 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,18 +472,31 @@ def _shallow_copy(self, values, name: Hashable = no_default):

if values.dtype.kind == "f":
return Index(values, name=name, dtype=np.float64)
if values.dtype.kind == "i" and values.ndim == 1 and len(values) > 1:
if values.dtype.kind == "i" and values.ndim == 1:
# GH 46675 & 43885: If values is equally spaced, return a
# more memory-compact RangeIndex instead of Index with 64-bit dtype
if len(values) == 0:
return type(self)._simple_new(_empty_range, name=name)
elif len(values) == 1:
start = values[0]
new_range = range(start, start + self.step, self.step)
return type(self)._simple_new(new_range, name=name)
diff = values[1] - values[0]
if not missing.isna(diff) and diff != 0:
maybe_range_indexer, remainder = np.divmod(values - values[0], diff)
if (
lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer))
and not remainder.any()
):
if len(values) == 2:
# Can skip is_range_indexer check
new_range = range(values[0], values[-1] + diff, diff)
return type(self)._simple_new(new_range, name=name)
else:
maybe_range_indexer, remainder = np.divmod(values - values[0], diff)
if (
lib.is_range_indexer(
maybe_range_indexer, len(maybe_range_indexer)
)
and not remainder.any()
):
new_range = range(values[0], values[-1] + diff, diff)
return type(self)._simple_new(new_range, name=name)
return self._constructor._simple_new(values, name=name)

def _view(self) -> Self:
Expand Down Expand Up @@ -897,12 +910,19 @@ def symmetric_difference(
result = result.rename(result_name)
return result

def _join_empty(
    self, other: Index, how: JoinHow, sort: bool
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
    """
    Normalize ``other`` and defer to the parent class's ``_join_empty``.

    When ``other`` has an integer-kind dtype its values are first passed
    through ``self._shallow_copy`` (keeping ``other``'s name); every other
    dtype is forwarded untouched.

    Parameters
    ----------
    other : Index
        The index being joined with.
    how : JoinHow
        Join method, forwarded unchanged to the parent implementation.
    sort : bool
        Forwarded unchanged to the parent implementation.

    Returns
    -------
    tuple
        Whatever the parent ``_join_empty`` returns for the (possibly
        converted) ``other``.
    """
    # NOTE(review): indentation was reconstructed from a flattened diff view;
    # the return is assumed to sit outside the ``if`` -- confirm against the
    # real file.
    if other.dtype.kind == "i":
        # Presumably this lets evenly spaced integers come back as a
        # RangeIndex -- verify against _shallow_copy.
        other = self._shallow_copy(other._values, name=other.name)
    return super()._join_empty(other, how=how, sort=sort)

def _join_monotonic(
self, other: Index, how: JoinHow = "left"
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
# This currently only gets called for the monotonic increasing case
if not isinstance(other, type(self)):
maybe_ri = self._shallow_copy(other._values)
maybe_ri = self._shallow_copy(other._values, name=other.name)
if not isinstance(maybe_ri, type(self)):
return super()._join_monotonic(other, how=how)
other = maybe_ri
Expand Down
8 changes: 7 additions & 1 deletion pandas/tests/indexes/ranges/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,15 @@ def test_join_self(self, join_type):
[-1, -1, 0, 1],
"outer",
],
[RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "left"],
[RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "right"],
[RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "inner"],
[RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "outer"],
],
)
@pytest.mark.parametrize("right_type", [RangeIndex, lambda x: Index(list(x))])
@pytest.mark.parametrize(
"right_type", [RangeIndex, lambda x: Index(list(x), dtype=x.dtype)]
)
def test_join_preserves_rangeindex(
left, right, expected, expected_lidx, expected_ridx, how, right_type
):
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/indexes/ranges/test_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,26 @@ def test_range_index_rsub_by_const(self):
tm.assert_index_equal(result, expected)


def test_reindex_1_value_returns_rangeindex():
    """Reindexing a named step-2 RangeIndex to the single label 2 yields a RangeIndex."""
    ri = RangeIndex(0, 10, 2, name="foo")
    result, result_indexer = ri.reindex([2])
    # The expected index is a one-element range starting at the requested
    # label, with the source index's step and name.
    expected = RangeIndex(2, 4, 2, name="foo")
    # exact=True: per the assert_index_equal docs this also compares the
    # Index class, so a plain integer Index would not be accepted here.
    tm.assert_index_equal(result, expected, exact=True)

    # Label 2 is the second element of 0, 2, 4, 6, 8, i.e. position 1.
    expected_indexer = np.array([1], dtype=np.intp)
    tm.assert_numpy_array_equal(result_indexer, expected_indexer)


def test_reindex_empty_returns_rangeindex():
    """Reindexing a named step-2 RangeIndex to an empty list yields an empty RangeIndex."""
    ri = RangeIndex(0, 10, 2, name="foo")
    result, result_indexer = ri.reindex([])
    # The expected index is an empty range that still carries the source
    # index's step (2) and name.
    expected = RangeIndex(0, 0, 2, name="foo")
    # exact=True: per the assert_index_equal docs this also compares the
    # Index class, not only the values.
    tm.assert_index_equal(result, expected, exact=True)

    # Nothing was requested, so the indexer is an empty intp array.
    expected_indexer = np.array([], dtype=np.intp)
    tm.assert_numpy_array_equal(result_indexer, expected_indexer)


def test_append_non_rangeindex_return_rangeindex():
ri = RangeIndex(1)
result = ri.append(Index([1]))
Expand Down Expand Up @@ -653,6 +673,21 @@ def test_take_return_rangeindex():
tm.assert_index_equal(result, expected, exact=True)


@pytest.mark.parametrize(
    "rng, exp_rng",
    [
        # Each case takes position 3; the expected range starts at that
        # element and spans exactly one step of the source range.
        [range(5), range(3, 4)],
        [range(0, -10, -2), range(-6, -8, -2)],
        [range(0, 10, 2), range(6, 8, 2)],
    ],
)
def test_take_1_value_returns_rangeindex(rng, exp_rng):
    """Taking a single position from a RangeIndex yields a one-element RangeIndex.

    Parameters
    ----------
    rng : range
        Range used to build the source index (step 1, negative step, step 2).
    exp_rng : range
        Range expected after taking position 3 from ``rng``.
    """
    ri = RangeIndex(rng, name="foo")
    result = ri.take([3])
    # The name "foo" is expected to carry over to the result.
    expected = RangeIndex(exp_rng, name="foo")
    # exact=True: per the assert_index_equal docs this also compares the
    # Index class, not only the values.
    tm.assert_index_equal(result, expected, exact=True)


def test_append_one_nonempty_preserve_step():
expected = RangeIndex(0, -1, -1)
result = RangeIndex(0).append([expected])
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ def test_loc_getitem_list_with_fail(self):

s.loc[[2]]

msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]"
msg = "None of [RangeIndex(start=3, stop=4, step=1)] are in the [index]"
with pytest.raises(KeyError, match=re.escape(msg)):
s.loc[[3]]

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/io/pytables/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,8 @@ def test_append_to_multiple_min_itemsize(setup_path):
}
)
expected = df.iloc[[0]]
# Reading/writing RangeIndex info is not supported yet
expected.index = Index(list(range(len(expected.index))))

with ensure_clean_store(setup_path) as store:
store.append_to_multiple(
Expand Down