ENH: Read and write pandas attrs to parquet with pyarrow engine #41545

Closed
wants to merge 5 commits into from
48 changes: 48 additions & 0 deletions doc/source/development/developer.rst
@@ -40,6 +40,8 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a
   {'index_columns': [<descr0>, <descr1>, ...],
    'column_indexes': [<ci0>, <ci1>, ..., <ciN>],
    'columns': [<c0>, <c1>, ...],
    'attrs': {...},
    'column_attrs': {<column_name0>: {...}, <column_name1>: {...}, ...},
    'pandas_version': $VERSION,
    'creator': {
      'library': $LIBRARY,
@@ -185,3 +187,49 @@ As an example of fully-formed metadata:
     'library': 'pyarrow',
     'version': '0.13.0'
   }}


Attribute metadata
~~~~~~~~~~~~~~~~~~

.. warning:: This only works with the ``pyarrow`` engine as of ``pandas`` 1.3.

The attributes of both the ``DataFrame`` and each ``Series`` are written and read
via:

[Member review comment] Probably also need to mention that this is an optional field?


- :attr:`DataFrame.attrs`
- :attr:`Series.attrs`

Here is an example:

.. code-block:: python

   df = pd.DataFrame({"a": [1], "b": [1]})
   df.attrs = {"name": "my custom dataset"}
   df.a.attrs = {
       "long_name": "Description about data",
       "nodata": -1,
       "units": "metre",
   }
   df.to_parquet("file.parquet")


Here is an example of the metadata:

.. code-block:: text

   {
    ...
    'attrs': {'name': 'my custom dataset'},
    'column_attrs': {
        'a': {
            'long_name': 'Description about data',
            'nodata': -1,
            'units': 'metre',
        },
    },
    'pandas_version': '1.3.0',
    'creator': {
        'library': 'pyarrow',
        'version': '0.13.0'
    }}
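
Reading the file back restores the attributes. A minimal round-trip sketch,
reusing the ``file.parquet`` written above:

.. code-block:: python

   result = pd.read_parquet("file.parquet")
   result.attrs    # {'name': 'my custom dataset'}
   result.a.attrs  # {'long_name': 'Description about data',
                   #  'nodata': -1, 'units': 'metre'}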
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -234,6 +234,7 @@ Other enhancements
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
- :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`)
- Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`)
- Read and write :class:`DataFrame` and :class:`Series` ``attrs`` to parquet with the pyarrow engine (:issue:`20521`)

.. ---------------------------------------------------------------------------

46 changes: 44 additions & 2 deletions pandas/io/parquet.py
@@ -2,6 +2,7 @@
from __future__ import annotations

import io
import json
import os
from typing import (
    Any,
@@ -142,6 +143,44 @@ def read(self, path, columns=None, **kwargs):
        raise AbstractMethodError(self)


def _pyarrow_write_attrs(table: Any, df: DataFrame) -> Any:
    """
    .. versionadded:: 1.3

    Copy attrs from pandas.DataFrame and pandas.Series to
    schema metadata in pyarrow.Table.
    """
    schema_metadata = table.schema.metadata or {}
    pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
    column_attrs = {}
    for col in df.columns:
        attrs = df[col].attrs
        # Skip empty attrs and non-string column names (JSON keys must be strings)
        if not attrs or not isinstance(col, str):
            continue
        column_attrs[col] = attrs
    pandas_metadata.update(
        attrs=df.attrs,
        column_attrs=column_attrs,
    )
    schema_metadata[b"pandas"] = json.dumps(pandas_metadata)
    return table.replace_schema_metadata(schema_metadata)


def _pyarrow_read_attrs(table: Any, df: DataFrame) -> None:
    """
    .. versionadded:: 1.3

    Copy schema metadata from pyarrow.Table
    to attrs in pandas.DataFrame and pandas.Series.
    """
    schema_metadata = table.schema.metadata or {}
    pandas_metadata = json.loads(schema_metadata.get(b"pandas", "{}"))
    df.attrs = pandas_metadata.get("attrs", {})
    col_attrs = pandas_metadata.get("column_attrs", {})
    for col in df.columns:
        df[col].attrs = col_attrs.get(col, {})


class PyArrowImpl(BaseImpl):
    def __init__(self):
        import_optional_dependency(
@@ -171,6 +210,7 @@ def write(
        from_pandas_kwargs["preserve_index"] = index

        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
        table = _pyarrow_write_attrs(table, df)

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
@@ -236,9 +276,11 @@ def read(
mode="rb",
)
try:
result = self.api.parquet.read_table(
table = self.api.parquet.read_table(
path_or_handle, columns=columns, **kwargs
).to_pandas(**to_pandas_kwargs)
)
result = table.to_pandas(**to_pandas_kwargs)
snowman2 marked this conversation as resolved.
Show resolved Hide resolved
_pyarrow_read_attrs(table, result)
if manager == "array":
result = result._as_manager("array", copy=False)
return result
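Note: ``_pyarrow_write_attrs`` piggybacks on the JSON document pyarrow keeps
under the ``b"pandas"`` schema-metadata key. A minimal sketch, independent of
this PR, of how such a payload round-trips through
``Table.replace_schema_metadata``:

.. code-block:: python

   import json

   import pyarrow as pa

   table = pa.table({"a": [1]})
   # Schema metadata is immutable; replace_schema_metadata returns a new Table
   payload = {b"pandas": json.dumps({"attrs": {"name": "demo"}}).encode()}
   table = table.replace_schema_metadata(payload)
   json.loads(table.schema.metadata[b"pandas"])["attrs"]  # {'name': 'demo'}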
33 changes: 33 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -907,6 +907,39 @@ def test_read_parquet_manager(self, pa, using_array_manager):
        else:
            assert isinstance(result._mgr, pd.core.internals.BlockManager)

    @td.skip_if_no("pyarrow")
    def test_read_write_attrs(self, pa):
        df = pd.DataFrame({"a": [1], "b": [1]})
        df.attrs = {"name": "my custom dataset"}
        df.a.attrs = {
            "long_name": "Description about data",
            "nodata": -1,
            "units": "metre",
        }
        df.b.attrs = {}
        with tm.ensure_clean() as path:
            df.to_parquet(path)
            result = read_parquet(path)

        assert result.attrs == {"name": "my custom dataset"}
        assert result.a.attrs == {
            "long_name": "Description about data",
            "nodata": -1,
            "units": "metre",
        }
        assert result.b.attrs == {}

    @td.skip_if_no("pyarrow")
    def test_read_write_attrs__invalid(self, pa):
        df = pd.DataFrame({"a": [1], "b": [1]})
        df.attrs = {-1: np.array(1)}
        df.a.attrs = {-1: np.array(1)}
        df.b.attrs = {}
        with tm.ensure_clean() as path, pytest.raises(
            TypeError, match="not JSON serializable"
        ):
            df.to_parquet(path)
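Note: because the attrs payload goes through ``json.dumps``, keys and values
must be JSON serializable, which is what the test above asserts. A hedged
caller-side sketch, converting NumPy values to native Python types before
writing:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({"a": [1]})
   # np.array(-1) is not JSON serializable; .item() yields a plain Python int
   df.a.attrs = {"nodata": np.array(-1).item(), "units": "metre"}
   df.to_parquet("file.parquet")  # serializes cleanly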


class TestParquetFastParquet(Base):
    def test_basic(self, fp, df_full):