Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pyarrow metadata #54344

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
897 changes: 897 additions & 0 deletions pandas/_libs/join 2.pyx

Large diffs are not rendered by default.

654 changes: 654 additions & 0 deletions pandas/core/computation/pytables 2.py

Large diffs are not rendered by default.

1,065 changes: 1,065 additions & 0 deletions pandas/core/groupby/grouper 2.py

Large diffs are not rendered by default.

1,107 changes: 1,107 additions & 0 deletions pandas/core/indexes/range 2.py

Large diffs are not rendered by default.

2,650 changes: 2,650 additions & 0 deletions pandas/core/reshape/merge 2.py

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
""" parquet compat """
from __future__ import annotations

import ast
import io
import json
import os
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -184,6 +186,11 @@ def write(

table = self.api.Table.from_pandas(df, **from_pandas_kwargs)

df_metadata = {"df.attrs": json.dumps(df.attrs)}
existing_metadata = table.schema.metadata
merged_metadata = {**existing_metadata, **df_metadata}
table = table.replace_schema_metadata(merged_metadata)

path_or_handle, handles, filesystem = _get_path_or_handle(
path,
filesystem,
Expand Down Expand Up @@ -263,6 +270,11 @@ def read(

if manager == "array":
result = result._as_manager("array", copy=False)

result.attrs = ast.literal_eval(
pa_table.schema.metadata[b"df.attrs"].decode("utf-8")
)

return result
finally:
if handles is not None:
Expand Down
154 changes: 154 additions & 0 deletions pandas/tests/indexes/numeric/test_setops 2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
from datetime import (
datetime,
timedelta,
)

import numpy as np
import pytest

import pandas._testing as tm
from pandas.core.indexes.api import (
Index,
RangeIndex,
)


@pytest.fixture
def index_large():
    """Return a uint64 Index whose values start at 2**63."""
    # large values used in TestUInt64Index where no compat is needed with
    # int64/float64
    base = 2**63
    return Index([base + offset for offset in (0, 10, 15, 20, 25)], dtype=np.uint64)


class TestSetOps:
    """Set operations (union, intersection, difference) on numeric indexes."""

    @pytest.mark.parametrize("dtype", ["f8", "u8", "i8"])
    def test_union_non_numeric(self, dtype):
        # corner case: numeric index combined with a non-numeric (object) one
        numeric = Index(np.arange(5, dtype=dtype), dtype=dtype)
        assert numeric.dtype == dtype

        stamps = [datetime.now() + timedelta(days=offset) for offset in range(4)]
        non_numeric = Index(stamps, dtype=object)

        # the union is compared against plain concatenation, in both orders
        for left, right in ((numeric, non_numeric), (non_numeric, numeric)):
            result = left.union(right)
            expected = Index(np.concatenate((left, right)))
            tm.assert_index_equal(result, expected)

    def test_intersection(self):
        left = Index(range(5), dtype=np.int64)
        right = Index([1, 2, 3, 4, 5])

        # numpy supplies the reference answer for both operand orders
        common = np.intersect1d(left.values, right.values)

        result = left.intersection(right)
        tm.assert_index_equal(result, Index(np.sort(common)))

        result = right.intersection(left)
        tm.assert_index_equal(result, Index(np.sort(np.asarray(common))))

    @pytest.mark.parametrize("dtype", ["int64", "uint64"])
    def test_int_float_union_dtype(self, dtype):
        # https://github.com/pandas-dev/pandas/issues/26778
        # [u]int | float -> float
        ints = Index([0, 2, 3], dtype=dtype)
        floats = Index([0.5, 1.5], dtype=np.float64)
        expected = Index([0.0, 0.5, 1.5, 2.0, 3.0], dtype=np.float64)

        tm.assert_index_equal(ints.union(floats), expected)
        tm.assert_index_equal(floats.union(ints), expected)

    def test_range_float_union_dtype(self):
        # https://github.com/pandas-dev/pandas/issues/26778
        rng = RangeIndex(start=0, stop=3)
        floats = Index([0.5, 1.5], dtype=np.float64)
        expected = Index([0.0, 0.5, 1, 1.5, 2.0], dtype=np.float64)

        tm.assert_index_equal(rng.union(floats), expected)
        tm.assert_index_equal(floats.union(rng), expected)

    def test_float64_index_difference(self):
        # https://github.com/pandas-dev/pandas/issues/35217
        floats = Index([1.0, 2, 3])
        strings = Index(["1", "2", "3"])

        # nothing is shared between the two, so each side comes back unchanged
        tm.assert_index_equal(floats.difference(strings), floats)
        tm.assert_index_equal(strings.difference(floats), strings)

    def test_intersection_uint64_outside_int64_range(self, index_large):
        base = 2**63
        other = Index([base + offset for offset in (0, 5, 10, 15, 20)])

        common = np.intersect1d(index_large.values, other.values)

        result = index_large.intersection(other)
        tm.assert_index_equal(result, Index(np.sort(common)))

        result = other.intersection(index_large)
        tm.assert_index_equal(result, Index(np.sort(np.asarray(common))))

    @pytest.mark.parametrize(
        "index2,keeps_name",
        [
            (Index([4, 7, 6, 5, 3], name="index"), True),
            (Index([4, 7, 6, 5, 3], name="other"), False),
        ],
    )
    def test_intersection_monotonic(self, index2, keeps_name, sort):
        index1 = Index([5, 3, 2, 4, 1], name="index")
        # the name is only expected to survive when both operands share it
        expected = Index([5, 3, 4], name="index" if keeps_name else None)

        result = index1.intersection(index2, sort=sort)

        if sort is None:
            expected = expected.sort_values()
        tm.assert_index_equal(result, expected)

    def test_symmetric_difference(self, sort):
        # smoke
        named = Index([5, 2, 3, 4], name="index1")
        unnamed = Index([2, 3, 4, 1])
        expected = Index([5, 1])

        result = named.symmetric_difference(unnamed, sort=sort)
        assert tm.equalContents(result, expected)
        assert result.name is None

        if sort is None:
            expected = expected.sort_values()
        tm.assert_index_equal(result, expected)


class TestSetOpsSort:
    """Sorting behaviour of ``Index.union`` for equal or empty operands."""

    @pytest.mark.parametrize("slice_", [slice(None), slice(0)])
    def test_union_sort_other_special(self, slice_):
        # https://github.com/pandas-dev/pandas/issues/24959
        unsorted = Index([1, 0, 2])
        # ``slice_`` yields either the full index or an empty one
        other = unsorted[slice_]

        # default, sort=None
        for left, right in ((unsorted, other), (other, unsorted)):
            tm.assert_index_equal(left.union(right), unsorted)

        # sort=False
        tm.assert_index_equal(unsorted.union(other, sort=False), unsorted)

    @pytest.mark.parametrize("slice_", [slice(None), slice(0)])
    def test_union_sort_special_true(self, slice_):
        unsorted = Index([1, 0, 2])
        other = unsorted[slice_]

        # an explicit sort=True is expected to yield sorted output
        result = unsorted.union(other, sort=True)
        tm.assert_index_equal(result, Index([0, 1, 2]))
95 changes: 95 additions & 0 deletions pandas/tests/indexes/ranges/test_indexing 2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import numpy as np
import pytest

from pandas import (
Index,
RangeIndex,
)
import pandas._testing as tm


class TestGetIndexer:
    """``RangeIndex.get_indexer`` with and without a fill method."""

    def test_get_indexer(self):
        evens = RangeIndex(start=0, stop=20, step=2)
        result = evens.get_indexer(RangeIndex(10))
        # odd targets are not present in the even-valued index, hence -1
        expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

    def test_get_indexer_pad(self):
        evens = RangeIndex(start=0, stop=20, step=2)
        result = evens.get_indexer(RangeIndex(10), method="pad")
        expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

    def test_get_indexer_backfill(self):
        evens = RangeIndex(start=0, stop=20, step=2)
        result = evens.get_indexer(RangeIndex(10), method="backfill")
        expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

    def test_get_indexer_limit(self):
        # GH#28631
        short = RangeIndex(4)
        longer = RangeIndex(6)
        result = short.get_indexer(longer, method="pad", limit=1)
        # with limit=1 only the first target past the end gets padded
        expected = np.array([0, 1, 2, 3, 3, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("stop", [0, -1, -2])
    def test_get_indexer_decreasing(self, stop):
        # GH#28678
        descending = RangeIndex(7, stop, -3)
        result = descending.get_indexer(range(9))
        expected = np.array([-1, 2, -1, -1, 1, -1, -1, 0, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)


class TestTake:
    """``RangeIndex.take``: name propagation, fill handling and bounds errors."""

    def test_take_preserve_name(self):
        source = RangeIndex(1, 5, name="foo")
        picked = source.take([3, 0, 1])
        assert picked.name == source.name

    def test_take_fill_value(self):
        # GH#12631
        idx = RangeIndex(1, 4, name="xxx")
        na_msg = "Unable to fill values because RangeIndex cannot contain NA"

        # without fill_value, -1 is an ordinary "last element" position
        result = idx.take(np.array([1, 0, -1]))
        expected = Index([2, 1, 3], dtype=np.int64, name="xxx")
        tm.assert_index_equal(result, expected)

        # fill_value
        with pytest.raises(ValueError, match=na_msg):
            idx.take(np.array([1, 0, -1]), fill_value=True)

        # allow_fill=False
        result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
        expected = Index([2, 1, 3], dtype=np.int64, name="xxx")
        tm.assert_index_equal(result, expected)

        # other negative positions raise the same error when fill_value is set
        for position in (-2, -5):
            with pytest.raises(ValueError, match=na_msg):
                idx.take(np.array([1, 0, position]), fill_value=True)

        bounds_msg = "index -5 is out of bounds for (axis 0 with )?size 3"
        with pytest.raises(IndexError, match=bounds_msg):
            idx.take(np.array([1, -5]))


class TestWhere:
    """``putmask`` / ``where`` on a RangeIndex."""

    def test_where_putmask_range_cast(self):
        # GH#43240
        rng = RangeIndex(0, 5, name="test")
        replace_first_two = np.array([True, True, False, False, False])
        expected = Index([10, 10, 2, 3, 4], dtype=np.int64, name="test")

        # putmask overwrites where the mask is True ...
        tm.assert_index_equal(rng.putmask(replace_first_two, 10), expected)

        # ... while where keeps values where its condition is True, so the
        # inverted mask must give the same result
        tm.assert_index_equal(rng.where(~replace_first_two, 10), expected)
8 changes: 8 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1192,6 +1192,14 @@ def test_partition_on_supported(self, tmp_path, fp, df_full):
actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats
assert len(actual_partition_cols) == 2

def test_df_attrs_persistence(self, tmp_path):
    # DataFrame.attrs should compare equal after a parquet write/read round trip
    attrs = {"Test attribute": 1}
    original = pd.DataFrame(data={1: [1]})
    original.attrs = attrs

    target = tmp_path / "test_df_metadata.p"
    original.to_parquet(target)
    roundtripped = read_parquet(target)

    assert roundtripped.attrs == original.attrs

def test_error_on_using_partition_cols_and_partition_on(
self, tmp_path, fp, df_full
):
Expand Down
Loading
Loading