From b284101fbcfede2ce805324407a2a3072f640fa6 Mon Sep 17 00:00:00 2001
From: Xiao Yuan
Date: Wed, 11 Oct 2023 02:57:46 +0800
Subject: [PATCH] ENH: read_spss stores the metadata in df.attrs (#55472)

* ENH: read_spss stores the metadata in df.attrs

* filter warning

* Make separate variable
---
 doc/source/whatsnew/v2.2.0.rst |  1 +
 pandas/io/spss.py              |  3 ++-
 pandas/tests/io/test_spss.py   | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 09067d541d01b..29a2d5c0b5877 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -76,6 +76,7 @@
 - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
+- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
 - :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
 - :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
 - DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
index 58487c6cd721b..db31a07df79e6 100644
--- a/pandas/io/spss.py
+++ b/pandas/io/spss.py
@@ -63,9 +63,10 @@ def read_spss(
             raise TypeError("usecols must be list-like.")
         usecols = list(usecols)  # pyreadstat requires a list

-    df, _ = pyreadstat.read_sav(
+    df, metadata = pyreadstat.read_sav(
         stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
     )
+    df.attrs = metadata.__dict__
     if dtype_backend is not lib.no_default:
         df = df.convert_dtypes(dtype_backend=dtype_backend)
     return df
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
index 5d8dce72d69c9..b612b64e3b020 100644
--- a/pandas/tests/io/test_spss.py
+++ b/pandas/tests/io/test_spss.py
@@ -116,3 +116,35 @@ def test_invalid_dtype_backend():
     )
     with pytest.raises(ValueError, match=msg):
         pd.read_spss("test", dtype_backend="numpy")
+
+
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+def test_spss_metadata(datapath):
+    # GH 54264
+    fname = datapath("io", "data", "spss", "labelled-num.sav")
+
+    df = pd.read_spss(fname)
+    metadata = {
+        "column_names": ["VAR00002"],
+        "column_labels": [None],
+        "column_names_to_labels": {"VAR00002": None},
+        "file_encoding": "UTF-8",
+        "number_columns": 1,
+        "number_rows": 1,
+        "variable_value_labels": {"VAR00002": {1.0: "This is one"}},
+        "value_labels": {"labels0": {1.0: "This is one"}},
+        "variable_to_label": {"VAR00002": "labels0"},
+        "notes": [],
+        "original_variable_types": {"VAR00002": "F8.0"},
+        "readstat_variable_types": {"VAR00002": "double"},
+        "table_name": None,
+        "missing_ranges": {},
+        "missing_user_values": {},
+        "variable_storage_width": {"VAR00002": 8},
+        "variable_display_width": {"VAR00002": 8},
+        "variable_alignment": {"VAR00002": "unknown"},
+        "variable_measure": {"VAR00002": "unknown"},
+        "file_label": None,
+        "file_format": "sav/zsav",
+    }
+    assert df.attrs == metadata
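
A minimal usage sketch of the new behavior follows. It is an illustration under stated assumptions, not part of the patch: it assumes a pandas build containing this change and pyreadstat installed, and the "labelled-num.sav" path plus the expected values simply mirror the test fixture above.

import pandas as pd

# read_spss still returns a DataFrame; with this change the pyreadstat
# metadata object is additionally exposed as a plain dict on DataFrame.attrs.
df = pd.read_spss("labelled-num.sav")  # illustrative path, mirrors the test fixture

# Individual metadata fields become ordinary dict entries, for example:
df.attrs["column_names"]           # ["VAR00002"] for the fixture above
df.attrs["variable_value_labels"]  # {"VAR00002": {1.0: "This is one"}}
df.attrs["file_encoding"]          # "UTF-8"

Because the assignment is df.attrs = metadata.__dict__, whatever attributes pyreadstat's metadata container exposes (column names, value labels, encoding, and so on) land in attrs unchanged, which is why the new test compares attrs against a full dict rather than checking individual keys.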