Skip to content

Commit

Permalink
PERF: RangeIndex.__getitem__ with integers return RangeIndex (pandas-…
Browse files Browse the repository at this point in the history
…dev#57770)

* PERF: RangeIndex.take with 1 value return RangeIndex

* add issue number

* Move to _shallow_copy, support empty join as well

* Fix self.name

* Fix error message

* Fix hdf test

* PERF: RangeIndex.__getitem__ with integers return RangeIndex

* PERF: RangeIndex.__getitem__ with integers return RangeIndex

* Handle Ellipsis

* Catch ValueError
  • Loading branch information
mroeschke authored and pmhatre1 committed May 7, 2024
1 parent 984cac3 commit 3d3a0b5
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 21 deletions.
8 changes: 4 additions & 4 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -272,11 +272,11 @@ Performance improvements
- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
- Performance improvement in :meth:`RangeIndex.append` when appending the same index (:issue:`57252`)
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`)
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`)
- Performance improvement in :meth:`RangeIndex.join` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57651`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)

Expand Down
1 change: 0 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4235,7 +4235,6 @@ def join(

return self._join_via_get_indexer(other, how, sort)

@final
def _join_empty(
self, other: Index, how: JoinHow, sort: bool
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
Expand Down
53 changes: 39 additions & 14 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,18 +472,31 @@ def _shallow_copy(self, values, name: Hashable = no_default):

if values.dtype.kind == "f":
return Index(values, name=name, dtype=np.float64)
if values.dtype.kind == "i" and values.ndim == 1 and len(values) > 1:
if values.dtype.kind == "i" and values.ndim == 1:
# GH 46675 & 43885: If values is equally spaced, return a
# more memory-compact RangeIndex instead of Index with 64-bit dtype
if len(values) == 0:
return type(self)._simple_new(_empty_range, name=name)
elif len(values) == 1:
start = values[0]
new_range = range(start, start + self.step, self.step)
return type(self)._simple_new(new_range, name=name)
diff = values[1] - values[0]
if not missing.isna(diff) and diff != 0:
maybe_range_indexer, remainder = np.divmod(values - values[0], diff)
if (
lib.is_range_indexer(maybe_range_indexer, len(maybe_range_indexer))
and not remainder.any()
):
if len(values) == 2:
# Can skip is_range_indexer check
new_range = range(values[0], values[-1] + diff, diff)
return type(self)._simple_new(new_range, name=name)
else:
maybe_range_indexer, remainder = np.divmod(values - values[0], diff)
if (
lib.is_range_indexer(
maybe_range_indexer, len(maybe_range_indexer)
)
and not remainder.any()
):
new_range = range(values[0], values[-1] + diff, diff)
return type(self)._simple_new(new_range, name=name)
return self._constructor._simple_new(values, name=name)

def _view(self) -> Self:
Expand Down Expand Up @@ -894,12 +907,19 @@ def symmetric_difference(
result = result.rename(result_name)
return result

def _join_empty(
    self, other: Index, how: JoinHow, sort: bool
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
    """
    Delegate to the parent ``_join_empty`` after offering ``other`` to
    ``_shallow_copy``.

    Parameters
    ----------
    other : Index
        The index being joined with.
    how : JoinHow
        Forwarded unchanged to the parent implementation.
    sort : bool
        Forwarded unchanged to the parent implementation.

    Returns
    -------
    tuple
        Whatever the parent ``_join_empty`` returns for the (possibly
        replaced) ``other``.
    """
    if other.dtype.kind == "i":
        # Signed-integer values are passed through _shallow_copy, keeping
        # other's name, so the parent join sees whatever that returns.
        other = self._shallow_copy(other._values, name=other.name)
    return super()._join_empty(other, how=how, sort=sort)

def _join_monotonic(
self, other: Index, how: JoinHow = "left"
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
# This currently only gets called for the monotonic increasing case
if not isinstance(other, type(self)):
maybe_ri = self._shallow_copy(other._values)
maybe_ri = self._shallow_copy(other._values, name=other.name)
if not isinstance(maybe_ri, type(self)):
return super()._join_monotonic(other, how=how)
other = maybe_ri
Expand Down Expand Up @@ -1075,6 +1095,8 @@ def __getitem__(self, key):
"""
Conserve RangeIndex type for scalar and slice keys.
"""
if key is Ellipsis:
key = slice(None)
if isinstance(key, slice):
return self._getitem_slice(key)
elif is_integer(key):
Expand All @@ -1094,17 +1116,20 @@ def __getitem__(self, key):
)
elif com.is_bool_indexer(key):
if isinstance(getattr(key, "dtype", None), ExtensionDtype):
np_key = key.to_numpy(dtype=bool, na_value=False)
key = key.to_numpy(dtype=bool, na_value=False)
else:
np_key = np.asarray(key, dtype=bool)
check_array_indexer(self._range, np_key) # type: ignore[arg-type]
key = np.asarray(key, dtype=bool)
check_array_indexer(self._range, key) # type: ignore[arg-type]
# Short circuit potential _shallow_copy check
if np_key.all():
if key.all():
return self._simple_new(self._range, name=self.name)
elif not np_key.any():
elif not key.any():
return self._simple_new(_empty_range, name=self.name)
return self.take(np.flatnonzero(np_key))
return super().__getitem__(key)
key = np.flatnonzero(key)
try:
return self.take(key)
except (TypeError, ValueError):
return super().__getitem__(key)

def _getitem_slice(self, slobj: slice) -> Self:
"""
Expand Down
8 changes: 7 additions & 1 deletion pandas/tests/indexes/ranges/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,15 @@ def test_join_self(self, join_type):
[-1, -1, 0, 1],
"outer",
],
[RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "left"],
[RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "right"],
[RangeIndex(2), RangeIndex(0), RangeIndex(0), [], None, "inner"],
[RangeIndex(2), RangeIndex(0), RangeIndex(2), None, [-1, -1], "outer"],
],
)
@pytest.mark.parametrize("right_type", [RangeIndex, lambda x: Index(list(x))])
@pytest.mark.parametrize(
"right_type", [RangeIndex, lambda x: Index(list(x), dtype=x.dtype)]
)
def test_join_preserves_rangeindex(
left, right, expected, expected_lidx, expected_ridx, how, right_type
):
Expand Down
57 changes: 57 additions & 0 deletions pandas/tests/indexes/ranges/test_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,26 @@ def test_range_index_rsub_by_const(self):
tm.assert_index_equal(result, expected)


def test_reindex_1_value_returns_rangeindex():
    """Reindexing to a single label yields a one-element RangeIndex."""
    ri = RangeIndex(0, 10, 2, name="foo")
    result, result_indexer = ri.reindex([2])
    # The expected index keeps the source step (2) and name.
    expected = RangeIndex(2, 4, 2, name="foo")
    tm.assert_index_equal(result, expected, exact=True)

    # Label 2 sits at position 1 of the original index.
    expected_indexer = np.array([1], dtype=np.intp)
    tm.assert_numpy_array_equal(result_indexer, expected_indexer)


def test_reindex_empty_returns_rangeindex():
    """Reindexing to an empty list yields an empty RangeIndex."""
    ri = RangeIndex(0, 10, 2, name="foo")
    result, result_indexer = ri.reindex([])
    # The expected empty index still carries the source step (2) and name.
    expected = RangeIndex(0, 0, 2, name="foo")
    tm.assert_index_equal(result, expected, exact=True)

    expected_indexer = np.array([], dtype=np.intp)
    tm.assert_numpy_array_equal(result_indexer, expected_indexer)


def test_append_non_rangeindex_return_rangeindex():
ri = RangeIndex(1)
result = ri.append(Index([1]))
Expand Down Expand Up @@ -653,6 +673,21 @@ def test_take_return_rangeindex():
tm.assert_index_equal(result, expected, exact=True)


@pytest.mark.parametrize(
    "rng, exp_rng",
    [
        [range(5), range(3, 4)],
        [range(0, -10, -2), range(-6, -8, -2)],
        [range(0, 10, 2), range(6, 8, 2)],
    ],
)
def test_take_1_value_returns_rangeindex(rng, exp_rng):
    """
    Taking a single position yields a one-element RangeIndex.

    Each ``exp_rng`` holds the value at position 3 of ``rng`` and uses the
    same step as ``rng``, including the negative-step case.
    """
    ri = RangeIndex(rng, name="foo")
    result = ri.take([3])
    expected = RangeIndex(exp_rng, name="foo")
    tm.assert_index_equal(result, expected, exact=True)


def test_append_one_nonempty_preserve_step():
expected = RangeIndex(0, -1, -1)
result = RangeIndex(0).append([expected])
Expand Down Expand Up @@ -695,3 +730,25 @@ def test_getitem_boolmask_wrong_length():
ri = RangeIndex(4, name="foo")
with pytest.raises(IndexError, match="Boolean index has wrong length"):
ri[[True]]


def test_getitem_integers_return_rangeindex():
    """Integer-list indexing returns a RangeIndex for these two selections."""
    # Positions 0 and -1 select values 0 and 8; the expected step is 8.
    result = RangeIndex(0, 10, 2, name="foo")[[0, -1]]
    expected = RangeIndex(start=0, stop=16, step=8, name="foo")
    tm.assert_index_equal(result, expected, exact=True)

    # A single position (value 6) is expected to keep the source step of 2.
    result = RangeIndex(0, 10, 2, name="foo")[[3]]
    expected = RangeIndex(start=6, stop=8, step=2, name="foo")
    tm.assert_index_equal(result, expected, exact=True)


def test_getitem_empty_return_rangeindex():
    """Indexing with an empty list returns an empty, named RangeIndex."""
    result = RangeIndex(0, 10, 2, name="foo")[[]]
    # The expected empty result uses step=1, not the source step of 2.
    expected = RangeIndex(start=0, stop=0, step=1, name="foo")
    tm.assert_index_equal(result, expected, exact=True)


def test_getitem_integers_return_index():
    """Unevenly spaced integer positions give an int64 Index of the values."""
    # Positions 0, 1, -1 select values 0, 2, 8, which are not equally spaced.
    result = RangeIndex(0, 10, 2, name="foo")[[0, 1, -1]]
    expected = Index([0, 2, 8], dtype="int64", name="foo")
    tm.assert_index_equal(result, expected)
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ def test_loc_getitem_list_with_fail(self):

s.loc[[2]]

msg = f"\"None of [Index([3], dtype='{np.dtype(int)}')] are in the [index]"
msg = "None of [RangeIndex(start=3, stop=4, step=1)] are in the [index]"
with pytest.raises(KeyError, match=re.escape(msg)):
s.loc[[3]]

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/io/pytables/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,8 @@ def test_append_to_multiple_min_itemsize(setup_path):
}
)
expected = df.iloc[[0]]
# Reading/writing RangeIndex info is not supported yet
expected.index = Index(list(range(len(expected.index))))

with ensure_clean_store(setup_path) as store:
store.append_to_multiple(
Expand Down

0 comments on commit 3d3a0b5

Please sign in to comment.