From 152595cae908836f90b3c2b89ce21eb0c8777dd0 Mon Sep 17 00:00:00 2001
From: Sanjith Chockan <36017129+SanjithChockan@users.noreply.github.com>
Date: Thu, 3 Aug 2023 14:39:35 -0600
Subject: [PATCH] Parquet metadata persistence of DataFrame.attrs (#54346)

* added df.attrs metadata to pyarrow table for persistence
* hooks
* placed unit test in correct class
* update unit test
* changed to consistent use of json
* added whatsnew
* added guard to check if df.attrs exists
* updated whatsnew
---
 doc/source/whatsnew/v2.1.0.rst  |  2 +-
 pandas/io/parquet.py            | 12 ++++++++++++
 pandas/tests/io/test_parquet.py |  8 ++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 861d802d3ba62..74df950ae0f0c 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -212,8 +212,8 @@ Other enhancements
 - Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`)
 - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
 - Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`)
+- :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`)
 - Performance improvement in :meth:`GroupBy.quantile` (:issue:`51722`)
--
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.notable_bug_fixes:
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 90d59b0dfcfc8..aaf7710ac0986 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import io
+import json
 import os
 from typing import (
     TYPE_CHECKING,
@@ -184,6 +185,12 @@ def write(
 
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
 
+        if df.attrs:
+            df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)}
+            existing_metadata = table.schema.metadata
+            merged_metadata = {**existing_metadata, **df_metadata}
+            table = table.replace_schema_metadata(merged_metadata)
+
         path_or_handle, handles, filesystem = _get_path_or_handle(
             path,
             filesystem,
@@ -268,6 +275,11 @@ def read(
 
             if manager == "array":
                 result = result._as_manager("array", copy=False)
+
+            if pa_table.schema.metadata:
+                if b"PANDAS_ATTRS" in pa_table.schema.metadata:
+                    df_metadata = pa_table.schema.metadata[b"PANDAS_ATTRS"]
+                    result.attrs = json.loads(df_metadata)
             return result
         finally:
             if handles is not None:
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 283d86227c79e..501e471695a8a 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1098,6 +1098,14 @@ def test_empty_columns(self, pa):
         df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
         check_round_trip(df, pa)
 
+    def test_df_attrs_persistence(self, tmp_path, pa):
+        path = tmp_path / "test_df_metadata.p"
+        df = pd.DataFrame(data={1: [1]})
+        df.attrs = {"test_attribute": 1}
+        df.to_parquet(path, engine=pa)
+        new_df = read_parquet(path, engine=pa)
+        assert new_df.attrs == df.attrs
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
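
Note (not part of the patch): a minimal round-trip sketch of the behavior this
change enables, for reviewers who want to try it locally. It assumes a pandas
build that includes this patch plus pyarrow; the file name
"with_attrs.parquet" is illustrative.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    # attrs must be JSON-serializable, since write() calls json.dumps(df.attrs)
    df.attrs = {"source": "example", "version": 2}

    # write() stores attrs as a JSON string under the b"PANDAS_ATTRS" schema key
    df.to_parquet("with_attrs.parquet", engine="pyarrow")

    # read() restores attrs from that key when it is present
    restored = pd.read_parquet("with_attrs.parquet", engine="pyarrow")
    assert restored.attrs == df.attrs

The stored metadata can also be inspected directly with pyarrow, which shows
the on-disk representation the patch produces:

    import json
    import pyarrow.parquet as pq

    schema = pq.read_schema("with_attrs.parquet")
    print(json.loads(schema.metadata[b"PANDAS_ATTRS"]))
    # -> {'source': 'example', 'version': 2}

One consequence of the json.dumps/json.loads pair: attrs that are not
JSON-serializable raise at write time, and values round-trip as their JSON
equivalents (e.g. a tuple comes back as a list, dict keys as strings).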