pandas-dev · SanjithChockan · Aug 1, 2023 · Aug 1, 2023
diff --git a/pandas/core/computation/pytables 2.py b/pandas/core/computation/pytables 2.py
diff --git a/pandas/core/indexes/range 2.py b/pandas/core/indexes/range 2.py
diff --git a/pandas/core/reshape/merge 2.py b/pandas/core/reshape/merge 2.py
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
@@ -1,7 +1,8 @@
 """ parquet compat """
 from __future__ import annotations
 
 import io
+import json
 import os
 from typing import (
     TYPE_CHECKING,
@@ -184,6 +185,14 @@ def write(
 
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
 
+        if df.attrs:
+            # Persist ``DataFrame.attrs`` as JSON under the "df.attrs" schema
+            # metadata key, keeping whatever metadata pyarrow already set.
+            df_metadata = {"df.attrs": json.dumps(df.attrs)}
+            existing_metadata = table.schema.metadata or {}
+            merged_metadata = {**existing_metadata, **df_metadata}
+            table = table.replace_schema_metadata(merged_metadata)
+
         path_or_handle, handles, filesystem = _get_path_or_handle(
             path,
             filesystem,
@@ -263,6 +272,15 @@ def read(
 
             if manager == "array":
                 result = result._as_manager("array", copy=False)
+
+            # Restore ``DataFrame.attrs`` if the file carries them; files from
+            # other writers have no "df.attrs" key. The value was produced by
+            # ``json.dumps``, so it must be parsed with ``json.loads``
+            # (``ast.literal_eval`` rejects JSON's true/false/null).
+            schema_metadata = pa_table.schema.metadata or {}
+            if b"df.attrs" in schema_metadata:
+                result.attrs = json.loads(schema_metadata[b"df.attrs"])
+
             return result
         finally:
             if handles is not None:

diff --git a/pandas/tests/indexes/numeric/test_setops 2.py b/pandas/tests/indexes/numeric/test_setops 2.py
@@ -0,0 +1,170 @@
+from datetime import (
+    datetime,
+    timedelta,
+)
+
+import numpy as np
+import pytest
+
+import pandas._testing as tm
+from pandas.core.indexes.api import (
+    Index,
+    RangeIndex,
+)
+
+
+@pytest.fixture
+def index_large():
+    """Return a uint64 Index whose values are all at or above 2**63."""
+    # large values used in TestUInt64Index where no compat needed with int64/float64
+    large = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25]
+    return Index(large, dtype=np.uint64)
+
+
+class TestSetOps:
+    """Set-operation tests for numeric Index objects."""
+
+    @pytest.mark.parametrize("dtype", ["f8", "u8", "i8"])
+    def test_union_non_numeric(self, dtype):
+        """Union a numeric Index with an object Index of datetimes, both orders."""
+        # corner case, non-numeric
+        index = Index(np.arange(5, dtype=dtype), dtype=dtype)
+        assert index.dtype == dtype
+
+        other = Index([datetime.now() + timedelta(i) for i in range(4)], dtype=object)
+        result = index.union(other)
+        expected = Index(np.concatenate((index, other)))
+        tm.assert_index_equal(result, expected)
+
+        result = other.union(index)
+        expected = Index(np.concatenate((other, index)))
+        tm.assert_index_equal(result, expected)
+
+    def test_intersection(self):
+        """Intersect two overlapping integer Indexes, in both orders."""
+        index = Index(range(5), dtype=np.int64)
+
+        other = Index([1, 2, 3, 4, 5])
+        result = index.intersection(other)
+        expected = Index(np.sort(np.intersect1d(index.values, other.values)))
+        tm.assert_index_equal(result, expected)
+
+        result = other.intersection(index)
+        expected = Index(
+            np.sort(np.asarray(np.intersect1d(index.values, other.values)))
+        )
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["int64", "uint64"])
+    def test_int_float_union_dtype(self, dtype):
+        """Union of an [u]int64 Index and a float64 Index gives a float64 Index."""
+        # https://github.com/pandas-dev/pandas/issues/26778
+        # [u]int | float -> float
+        index = Index([0, 2, 3], dtype=dtype)
+        other = Index([0.5, 1.5], dtype=np.float64)
+        expected = Index([0.0, 0.5, 1.5, 2.0, 3.0], dtype=np.float64)
+        result = index.union(other)
+        tm.assert_index_equal(result, expected)
+
+        result = other.union(index)
+        tm.assert_index_equal(result, expected)
+
+    def test_range_float_union_dtype(self):
+        """Union of a RangeIndex and a float64 Index gives a float64 Index."""
+        # https://github.com/pandas-dev/pandas/issues/26778
+        index = RangeIndex(start=0, stop=3)
+        other = Index([0.5, 1.5], dtype=np.float64)
+        result = index.union(other)
+        expected = Index([0.0, 0.5, 1, 1.5, 2.0], dtype=np.float64)
+        tm.assert_index_equal(result, expected)
+
+        result = other.union(index)
+        tm.assert_index_equal(result, expected)
+
+    def test_float64_index_difference(self):
+        """Float vs. string Index difference returns the calling Index."""
+        # https://github.com/pandas-dev/pandas/issues/35217
+        float_index = Index([1.0, 2, 3])
+        string_index = Index(["1", "2", "3"])
+
+        result = float_index.difference(string_index)
+        tm.assert_index_equal(result, float_index)
+
+        result = string_index.difference(float_index)
+        tm.assert_index_equal(result, string_index)
+
+    def test_intersection_uint64_outside_int64_range(self, index_large):
+        """Intersect Indexes whose values are all >= 2**63, in both orders."""
+        other = Index([2**63, 2**63 + 5, 2**63 + 10, 2**63 + 15, 2**63 + 20])
+        result = index_large.intersection(other)
+        expected = Index(np.sort(np.intersect1d(index_large.values, other.values)))
+        tm.assert_index_equal(result, expected)
+
+        result = other.intersection(index_large)
+        expected = Index(
+            np.sort(np.asarray(np.intersect1d(index_large.values, other.values)))
+        )
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "index2,keeps_name",
+        [
+            (Index([4, 7, 6, 5, 3], name="index"), True),
+            (Index([4, 7, 6, 5, 3], name="other"), False),
+        ],
+    )
+    def test_intersection_monotonic(self, index2, keeps_name, sort):
+        """Intersection keeps the name only when both names match; sort=None sorts."""
+        # NOTE(review): ``sort`` is a fixture defined outside this file - confirm
+        index1 = Index([5, 3, 2, 4, 1], name="index")
+        expected = Index([5, 3, 4])
+
+        if keeps_name:
+            expected.name = "index"
+
+        result = index1.intersection(index2, sort=sort)
+        if sort is None:
+            expected = expected.sort_values()
+        tm.assert_index_equal(result, expected)
+
+    def test_symmetric_difference(self, sort):
+        """Symmetric difference of a named and an unnamed Index has no name."""
+        # smoke
+        index1 = Index([5, 2, 3, 4], name="index1")
+        index2 = Index([2, 3, 4, 1])
+        result = index1.symmetric_difference(index2, sort=sort)
+        expected = Index([5, 1])
+        assert tm.equalContents(result, expected)
+        assert result.name is None
+        if sort is None:
+            expected = expected.sort_values()
+        tm.assert_index_equal(result, expected)
+
+
+class TestSetOpsSort:
+    """Union results under the different ``sort`` settings."""
+
+    @pytest.mark.parametrize("slice_", [slice(None), slice(0)])
+    def test_union_sort_other_special(self, slice_):
+        """Union with itself or with an empty Index returns the original order."""
+        # https://github.com/pandas-dev/pandas/issues/24959
+
+        idx = Index([1, 0, 2])
+        # default, sort=None
+        other = idx[slice_]
+        tm.assert_index_equal(idx.union(other), idx)
+        tm.assert_index_equal(other.union(idx), idx)
+
+        # sort=False
+        tm.assert_index_equal(idx.union(other, sort=False), idx)
+
+    @pytest.mark.parametrize("slice_", [slice(None), slice(0)])
+    def test_union_sort_special_true(self, slice_):
+        """With sort=True, union with itself or an empty Index is still sorted."""
+        idx = Index([1, 0, 2])
+        # other is either idx itself or empty; the union below passes sort=True
+        other = idx[slice_]
+
+        result = idx.union(other, sort=True)
+        expected = Index([0, 1, 2])
+        tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/ranges/test_indexing 2.py b/pandas/tests/indexes/ranges/test_indexing 2.py
@@ -0,0 +1,110 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    Index,
+    RangeIndex,
+)
+import pandas._testing as tm
+
+
+class TestGetIndexer:
+    """RangeIndex.get_indexer with and without a fill method."""
+
+    def test_get_indexer(self):
+        """Targets absent from an even-stepped RangeIndex map to -1."""
+        index = RangeIndex(start=0, stop=20, step=2)
+        target = RangeIndex(10)
+        indexer = index.get_indexer(target)
+        expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected)
+
+    def test_get_indexer_pad(self):
+        """With method="pad", a missing target maps to the previous position."""
+        index = RangeIndex(start=0, stop=20, step=2)
+        target = RangeIndex(10)
+        indexer = index.get_indexer(target, method="pad")
+        expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected)
+
+    def test_get_indexer_backfill(self):
+        """With method="backfill", a missing target maps to the next position."""
+        index = RangeIndex(start=0, stop=20, step=2)
+        target = RangeIndex(10)
+        indexer = index.get_indexer(target, method="backfill")
+        expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
+        tm.assert_numpy_array_equal(indexer, expected)
+
+    def test_get_indexer_limit(self):
+        """pad with limit=1 fills only the first target past the end of the index."""
+        # GH#28631
+        idx = RangeIndex(4)
+        target = RangeIndex(6)
+        result = idx.get_indexer(target, method="pad", limit=1)
+        expected = np.array([0, 1, 2, 3, 3, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("stop", [0, -1, -2])
+    def test_get_indexer_decreasing(self, stop):
+        """get_indexer on a RangeIndex with a negative step."""
+        # GH#28678
+        # each parametrized stop gives the same labels: 7, 4, 1
+        index = RangeIndex(7, stop, -3)
+        result = index.get_indexer(range(9))
+        expected = np.array([-1, 2, -1, -1, 1, -1, -1, 0, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+
+class TestTake:
+    """RangeIndex.take: name handling, negative indices and fill_value."""
+
+    def test_take_preserve_name(self):
+        """The Index returned by take() carries the original name."""
+        index = RangeIndex(1, 5, name="foo")
+        taken = index.take([3, 0, 1])
+        assert index.name == taken.name
+
+    def test_take_fill_value(self):
+        """Exercise take() with negative indices, fill_value and allow_fill=False."""
+        # GH#12631
+        idx = RangeIndex(1, 4, name="xxx")
+        result = idx.take(np.array([1, 0, -1]))
+        expected = Index([2, 1, 3], dtype=np.int64, name="xxx")
+        tm.assert_index_equal(result, expected)
+
+        # fill_value
+        msg = "Unable to fill values because RangeIndex cannot contain NA"
+        with pytest.raises(ValueError, match=msg):
+            idx.take(np.array([1, 0, -1]), fill_value=True)
+
+        # allow_fill=False
+        result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
+        expected = Index([2, 1, 3], dtype=np.int64, name="xxx")
+        tm.assert_index_equal(result, expected)
+
+        msg = "Unable to fill values because RangeIndex cannot contain NA"
+        with pytest.raises(ValueError, match=msg):
+            idx.take(np.array([1, 0, -2]), fill_value=True)
+        with pytest.raises(ValueError, match=msg):
+            idx.take(np.array([1, 0, -5]), fill_value=True)
+
+        msg = "index -5 is out of bounds for (axis 0 with )?size 3"
+        with pytest.raises(IndexError, match=msg):
+            idx.take(np.array([1, -5]))
+
+
+class TestWhere:
+    """RangeIndex.putmask and RangeIndex.where."""
+
+    def test_where_putmask_range_cast(self):
+        """putmask/where on a RangeIndex give an int64 Index with the new values."""
+        # GH#43240
+        idx = RangeIndex(0, 5, name="test")
+
+        mask = np.array([True, True, False, False, False])
+        result = idx.putmask(mask, 10)
+        expected = Index([10, 10, 2, 3, 4], dtype=np.int64, name="test")
+        tm.assert_index_equal(result, expected)
+
+        result = idx.where(~mask, 10)
+        tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -1192,6 +1192,17 @@ def test_partition_on_supported(self, tmp_path, fp, df_full):
         actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats
         assert len(actual_partition_cols) == 2
 
+    def test_df_attrs_persistence(self, tmp_path, pa):
+        # DataFrame.attrs should survive a to_parquet/read_parquet round trip.
+        # attrs are stored in the pyarrow schema metadata, so request the
+        # pyarrow engine explicitly instead of relying on engine="auto".
+        path = tmp_path / "test_df_metadata.p"
+        df = pd.DataFrame(data={1: [1]})
+        df.attrs = {"Test attribute": 1}
+        df.to_parquet(path, engine=pa)
+        new_df = read_parquet(path, engine=pa)
+        assert new_df.attrs == df.attrs
+
     def test_error_on_using_partition_cols_and_partition_on(
         self, tmp_path, fp, df_full
     ):