ENH: Read and write pandas attrs to parquet with pyarrow engine #41545

Closed
wants to merge 5 commits into from
48 changes: 48 additions & 0 deletions doc/source/development/developer.rst
@@ -40,6 +40,8 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
   {'index_columns': [<descr0>, <descr1>, ...],
    'column_indexes': [<ci0>, <ci1>, ..., <ciN>],
    'columns': [<c0>, <c1>, ...],
    'attrs': {...},
    'column_attrs': {<column_name0>: {...}, <column_name1>: {...}, ...},
    'pandas_version': $VERSION,
    'creator': {
      'library': $LIBRARY,
@@ -185,3 +187,49 @@ As an example of fully-formed metadata:
     'library': 'pyarrow',
     'version': '0.13.0'
   }}


Attribute metadata
~~~~~~~~~~~~~~~~~~

.. warning:: This only works with the ``pyarrow`` engine as of ``pandas`` 1.3.

The attributes of both the ``DataFrame`` and each ``Series`` are written and read
via:

[Member review comment] Probably also need to mention that this is an optional field?


- :attr:`DataFrame.attrs`
- :attr:`Series.attrs`

Here is an example:

.. code-block:: python

   df = pd.DataFrame({"a": [1], "b": [1]})
   df.attrs = {"name": "my custom dataset"}
   df.a.attrs = {
       "long_name": "Description about data",
       "nodata": -1,
       "units": "metre",
   }
   df.to_parquet("file.parquet")


Here is an example of the metadata:

.. code-block:: text

   {
    ...
    'attrs': {'name': 'my custom dataset'},
    'column_attrs': {
        'a': {
            'long_name': 'Description about data',
            'nodata': -1,
            'units': 'metre',
        },
    },
    'pandas_version': '1.3.0',
    'creator': {
        'library': 'pyarrow',
        'version': '0.13.0'
    }}
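
Reading the file back restores the attributes. A minimal round-trip sketch,
reusing the ``file.parquet`` written above:

.. code-block:: python

   result = pd.read_parquet("file.parquet")
   result.attrs    # {'name': 'my custom dataset'}
   result.a.attrs  # {'long_name': 'Description about data',
                   #  'nodata': -1, 'units': 'metre'}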
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -234,6 +234,7 @@ Other enhancements
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
- :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
- Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`)
- Read and write :class:`DataFrame` and :class:`Series` ``attrs`` to parquet with the pyarrow engine (:issue:`20521`)

.. ---------------------------------------------------------------------------

46 changes: 44 additions & 2 deletions pandas/io/parquet.py
@@ -2,6 +2,7 @@
from __future__ import annotations

import io
import json
import os
from typing import (
    Any,
@@ -142,6 +143,44 @@ def read(self, path, columns=None, **kwargs):
        raise AbstractMethodError(self)


def _pyarrow_write_attrs(table: Any, df: DataFrame) -> Any:
    """
    .. versionadded:: 1.3

    Copy attrs from pandas.DataFrame and pandas.Series to
    schema metadata in pyarrow.Table.
    """
    schema_metadata = table.schema.metadata or {}
    pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
    column_attrs = {}
    for col in df.columns:
        attrs = df[col].attrs
        # Skip empty attrs and non-string column names (JSON keys must be strings)
        if not attrs or not isinstance(col, str):
            continue
        column_attrs[col] = attrs
    pandas_metadata.update(
        attrs=df.attrs,
        column_attrs=column_attrs,
    )
    schema_metadata[b"pandas"] = json.dumps(pandas_metadata)
    return table.replace_schema_metadata(schema_metadata)


def _pyarrow_read_attrs(table: Any, df: DataFrame) -> None:
    """
    .. versionadded:: 1.3

    Copy schema metadata from pyarrow.Table
    to attrs in pandas.DataFrame and pandas.Series.
    """
    schema_metadata = table.schema.metadata or {}
    pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
    df.attrs = pandas_metadata.get("attrs", {})
    col_attrs = pandas_metadata.get("column_attrs", {})
    for col in df.columns:
        df[col].attrs = col_attrs.get(col, {})


class PyArrowImpl(BaseImpl):
    def __init__(self):
        import_optional_dependency(
@@ -171,6 +210,7 @@ def write(
        from_pandas_kwargs["preserve_index"] = index

        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
        table = _pyarrow_write_attrs(table, df)

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
@@ -236,9 +276,11 @@ def read(
mode="rb",
)
try:
result = self.api.parquet.read_table(
table = self.api.parquet.read_table(
path_or_handle, columns=columns, **kwargs
).to_pandas(**to_pandas_kwargs)
)
result = table.to_pandas(**to_pandas_kwargs)
snowman2 marked this conversation as resolved.
Show resolved Hide resolved
_pyarrow_read_attrs(table, result)
if manager == "array":
result = result._as_manager("array", copy=False)
return result
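Note: ``_pyarrow_write_attrs`` piggybacks on the JSON document pyarrow keeps
under the ``b"pandas"`` schema-metadata key. A minimal sketch, independent of
this PR, of how such a payload round-trips through
``Table.replace_schema_metadata``:

.. code-block:: python

   import json

   import pyarrow as pa

   table = pa.table({"a": [1]})
   # Schema metadata is immutable; replace_schema_metadata returns a new Table
   payload = {b"pandas": json.dumps({"attrs": {"name": "demo"}}).encode()}
   table = table.replace_schema_metadata(payload)
   json.loads(table.schema.metadata[b"pandas"])["attrs"]  # {'name': 'demo'}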
33 changes: 33 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -907,6 +907,39 @@ def test_read_parquet_manager(self, pa, using_array_manager):
        else:
            assert isinstance(result._mgr, pd.core.internals.BlockManager)

    @td.skip_if_no("pyarrow")
    def test_read_write_attrs(self, pa):
        df = pd.DataFrame({"a": [1], "b": [1]})
        df.attrs = {"name": "my custom dataset"}
        df.a.attrs = {
            "long_name": "Description about data",
            "nodata": -1,
            "units": "metre",
        }
        df.b.attrs = {}
        with tm.ensure_clean() as path:
            df.to_parquet(path)
            result = read_parquet(path)

        assert result.attrs == {"name": "my custom dataset"}
        assert result.a.attrs == {
            "long_name": "Description about data",
            "nodata": -1,
            "units": "metre",
        }
        assert result.b.attrs == {}

    @td.skip_if_no("pyarrow")
    def test_read_write_attrs__invalid(self, pa):
        df = pd.DataFrame({"a": [1], "b": [1]})
        df.attrs = {-1: np.array(1)}
        df.a.attrs = {-1: np.array(1)}
        df.b.attrs = {}
        with tm.ensure_clean() as path, pytest.raises(
            TypeError, match="not JSON serializable"
        ):
            df.to_parquet(path)
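Note: because the attrs payload goes through ``json.dumps``, keys and values
must be JSON serializable, which is what the test above asserts. A hedged
caller-side sketch, converting NumPy values to native Python types before
writing:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({"a": [1]})
   # np.array(-1) is not JSON serializable; .item() yields a plain Python int
   df.a.attrs = {"nodata": np.array(-1).item(), "units": "metre"}
   df.to_parquet("file.parquet")  # serializes cleanly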


class TestParquetFastParquet(Base):
    def test_basic(self, fp, df_full):