From 930d24299966a17481914929bd461865446fea5b Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 14:48:58 +0200 Subject: [PATCH 01/67] Ensure correct boolean dtype in misc table index --- audformat/core/table.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/audformat/core/table.py b/audformat/core/table.py index 18a4b863..f2d18d2f 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -847,6 +847,11 @@ def _load_csv(self, path: str): float_precision="round_trip", ) + # Ensure bool values are stored as boolean, + # as pandas.read_csv() + # does not set this correctly + df.index = utils._maybe_convert_pandas_dtype(df.index) + # For an empty CSV file # converters will not set the correct dtype # and we need to correct it manually From 8d38ba96bdd1e1d4db863707446afcd2520c7f55 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 15:09:54 +0200 Subject: [PATCH 02/67] Remove unneeded code --- audformat/core/table.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index f2d18d2f..18a4b863 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -847,11 +847,6 @@ def _load_csv(self, path: str): float_precision="round_trip", ) - # Ensure bool values are stored as boolean, - # as pandas.read_csv() - # does not set this correctly - df.index = utils._maybe_convert_pandas_dtype(df.index) - # For an empty CSV file # converters will not set the correct dtype # and we need to correct it manually From 06f3a34064d663f18ae6bf72fb7f9bdfa4218b54 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 20 Mar 2024 13:32:14 +0100 Subject: [PATCH 03/67] Use pyarrow to read CSV files --- audformat/core/common.py | 31 +++++++ audformat/core/table.py | 180 ++++++++++++++++++++++++++++----------- pyproject.toml | 1 + 3 files changed, 164 insertions(+), 48 deletions(-) diff --git a/audformat/core/common.py b/audformat/core/common.py index 833b44fc..e8733735 100644 --- a/audformat/core/common.py +++ b/audformat/core/common.py @@ -6,6 +6,7 @@ import oyaml as yaml import pandas as pd +import pyarrow as pa from audformat import define from audformat.core.errors import BadKeyError @@ -388,3 +389,33 @@ def to_pandas_dtype(dtype: str) -> typing.Optional[str]: return "string" elif dtype == define.DataType.TIME: return "timedelta64[ns]" + + +def to_pyarrow_dtype(dtype: str) -> typing.Optional[str]: + r"""Convert audformat to pyarrow dtype. + + For ``"object"`` as ``dtype`` + there is no equivalent, + and we don't return a value here. + We let ``pyarrow`` decide, + which dtype fits best in that case. 
+ + Args: + dtype: audformat dtype + + Returns: + pyarrow dtype + + """ + if dtype == define.DataType.BOOL: + return pa.bool_() + elif dtype == define.DataType.DATE: + return pa.timestamp("ns") + elif dtype == define.DataType.FLOAT: + return pa.float64() + elif dtype == define.DataType.INTEGER: + return pa.int64() + elif dtype == define.DataType.STRING: + return pa.string() + elif dtype == define.DataType.TIME: + return pa.string() diff --git a/audformat/core/table.py b/audformat/core/table.py index 18a4b863..e3a287b2 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -6,6 +6,8 @@ import typing import pandas as pd +import pyarrow as pa +import pyarrow.csv as csv import audeer @@ -15,7 +17,7 @@ from audformat.core.common import HeaderBase from audformat.core.common import HeaderDict from audformat.core.common import to_audformat_dtype -from audformat.core.common import to_pandas_dtype +from audformat.core.common import to_pyarrow_dtype from audformat.core.errors import BadIdError from audformat.core.index import filewise_index from audformat.core.index import index_type @@ -801,68 +803,150 @@ def _get_by_index( raise NotImplementedError() def _load_csv(self, path: str): + r"""Load table from CSV file. + + The loaded table is stored under ``self._df``. + + Loading a CSV file with :func:`pd.read_csv()` is slower + than the method applied here. + We first load the CSV file as a :class:`pa.Table` + and convert it to a dataframe afterwards. + + Args: + path: path to table, including file extension + + """ schemes = self.db.schemes - converters = {} - dtypes = {} + # === DTYPES === + + # Collect dtypes + # of the CSV file, + # by inspecting the audformat schemes, + # and the index + # associated with the table. + dtypes = [] + + # Collect columns, + # that cannot directly be converted to pyarrow + timedelta_columns = [] + boolean_columns = [] + object_columns = [] + integer_columns = [] + + # --- Index --- if hasattr(self, "type"): # filewise or segmented table - dtypes[define.IndexField.FILE] = define.DataType.STRING + index_columns = [] + name = define.IndexField.FILE + dtypes.append((name, to_pyarrow_dtype(define.DataType.STRING))) + index_columns.append(name) if self.type == define.IndexType.SEGMENTED: - dtypes[define.IndexField.START] = define.DataType.TIME - dtypes[define.IndexField.END] = define.DataType.TIME + for name in [define.IndexField.START, define.IndexField.END]: + dtypes.append((name, to_pyarrow_dtype(define.DataType.TIME))) + index_columns.append(name) + timedelta_columns.append(name) else: # misc table - dtypes = self.levels - - # index columns - levels = list(dtypes) - dtypes = {level: to_pandas_dtype(dtype) for level, dtype in dtypes.items()} + index_columns = list(self.levels.keys()) + for name, dtype in self.levels.items(): + _dtype = to_pyarrow_dtype(dtype) + if _dtype is not None: + dtypes.append((name, _dtype)) + if dtype == define.DataType.TIME: + timedelta_columns.append(name) + elif dtype == define.DataType.INTEGER: + integer_columns.append(name) + else: + object_columns.append(name) - # other columns + # --- Columns --- + categories = {} columns = list(self.columns) for column_id, column in self.columns.items(): if column.scheme_id is not None: - dtypes[column_id] = schemes[column.scheme_id].to_pandas_dtype() - else: - dtypes[column_id] = "object" - - # replace dtype with converter for dates or timestamps - dtypes_wo_converters = {} - for column_id, dtype in dtypes.items(): - if dtype == "datetime64[ns]": - converters[column_id] = lambda x: 
pd.to_datetime(x) - elif dtype == "timedelta64[ns]": - converters[column_id] = lambda x: pd.to_timedelta(x) + scheme = schemes[column.scheme_id] + if scheme.labels is not None: + categories[column_id] = scheme._labels_to_list() + dtype = to_pyarrow_dtype(scheme.dtype) + if dtype is not None: + dtypes.append((column_id, dtype)) + if scheme.dtype == define.DataType.TIME: + timedelta_columns.append(column_id) + elif scheme.dtype == define.DataType.BOOL: + boolean_columns.append(column_id) + elif scheme.dtype == define.DataType.INTEGER: + integer_columns.append(column_id) + else: + object_columns.append(column_id) else: - dtypes_wo_converters[column_id] = dtype + object_columns.append(column_id) - # read csv - df = pd.read_csv( + schema = pa.schema(dtypes) + table = csv.read_csv( path, - usecols=levels + columns, - dtype=dtypes_wo_converters, - index_col=levels, - converters=converters, - float_precision="round_trip", + read_options=csv.ReadOptions( + column_names=index_columns + columns, + skip_rows=1, + ), + convert_options=csv.ConvertOptions( + column_types=schema, + strings_can_be_null=True, + ), ) - - # For an empty CSV file - # converters will not set the correct dtype - # and we need to correct it manually - if len(df) == 0: - # fix index - converter_dtypes = { - level: dtype - for level, dtype in dtypes.items() - if level in converters and level in levels - } - df.index = utils.set_index_dtypes(df.index, converter_dtypes) - # fix columns - for column_id in columns: - if column_id in converters: - dtype = dtypes[column_id] - df[column_id] = df[column_id].astype(dtype) + df = table.to_pandas( + deduplicate_objects=False, + types_mapper={ + pa.string(): pd.StringDtype(), + }.get, # we have to provide a callable, not a dict + ) + # Free no longer needed memory + del table + # Adjust dtypes, that cannot be handled by pyarrow + for column in timedelta_columns: + if len(df) == 0: + # For an empty dataframe, map() will not set the correct dtype + df[column] = df[column].astype("timedelta64[ns]") + else: + df[column] = df[column].map( + # "coerce" will set errors to NaT, + # and catches the case where the input is already + lambda x: pd.to_timedelta(x, errors="coerce") + ) + for column in boolean_columns: + df[column] = df[column].map(lambda x: pd.NA if x is None else x) + df[column] = df[column].astype(pd.BooleanDtype()) + for column in object_columns: + df[column] = df[column].astype("object") + df[column] = df[column].replace(pd.NA, None) + for column in integer_columns: + df[column] = df[column].astype("Int64") + for column, labels in categories.items(): + if len(labels) > 0 and isinstance(labels[0], int): + # allow nullable + labels = pd.array(labels, dtype="int64") + dtype = pd.api.types.CategoricalDtype( + categories=labels, + ordered=False, + ) + df[column] = df[column].astype(dtype) + + # Set index + # + # When assigning more than one column, + # a MultiIndex is assigned. + # As the MultiIndex does not preserve dtypes, + # we need to set them manually. 
+ # + if len(index_columns) > 0: + index_dtypes = {column: df[column].dtype for column in index_columns} + df.set_index(index_columns, inplace=True) + if len(index_columns) > 1: + df.index = utils.set_index_dtypes(df.index, index_dtypes) + elif len(index_columns) > 0: + # Ensure pd.BooleanDtype is used for pd.Index + if index_dtypes[index_columns[0]] == bool: + df.index = df.index.astype(pd.BooleanDtype()) self._df = df diff --git a/pyproject.toml b/pyproject.toml index 3d263b93..1100e75d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ 'iso-639', 'iso3166', 'oyaml', + 'pyarrow', 'pyyaml >=5.4.1', 'pandas >=1.4.1', ] From e5045d03813d638144c152e1d3753b5528dfcbf1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 15:58:26 +0200 Subject: [PATCH 04/67] Start debugging --- audformat/core/table.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index e3a287b2..337a1bd2 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -915,7 +915,7 @@ def _load_csv(self, path: str): ) for column in boolean_columns: df[column] = df[column].map(lambda x: pd.NA if x is None else x) - df[column] = df[column].astype(pd.BooleanDtype()) + df[column] = df[column].astype("boolean") for column in object_columns: df[column] = df[column].astype("object") df[column] = df[column].replace(pd.NA, None) @@ -940,13 +940,11 @@ def _load_csv(self, path: str): # if len(index_columns) > 0: index_dtypes = {column: df[column].dtype for column in index_columns} + print(f"{self.levels=}") + print(f"{index_dtypes=}") df.set_index(index_columns, inplace=True) - if len(index_columns) > 1: + if len(index_columns) > 0: df.index = utils.set_index_dtypes(df.index, index_dtypes) - elif len(index_columns) > 0: - # Ensure pd.BooleanDtype is used for pd.Index - if index_dtypes[index_columns[0]] == bool: - df.index = df.index.astype(pd.BooleanDtype()) self._df = df From 463c15f1e4bbc33fa71bfc143ad2b64583eaa0e9 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 16:20:26 +0200 Subject: [PATCH 05/67] Continue debugging --- audformat/core/table.py | 21 +++++++++++++++------ tests/test_misc_table.py | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 337a1bd2..ab0a6655 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -857,9 +857,12 @@ def _load_csv(self, path: str): timedelta_columns.append(name) elif dtype == define.DataType.INTEGER: integer_columns.append(name) + elif dtype == define.DataType.BOOL: + boolean_columns.append(name) else: object_columns.append(name) + print(f"{dtypes=}") # --- Columns --- categories = {} columns = list(self.columns) @@ -894,12 +897,14 @@ def _load_csv(self, path: str): strings_can_be_null=True, ), ) + print(f"{table=}") df = table.to_pandas( deduplicate_objects=False, types_mapper={ pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) + print(f"{df=}") # Free no longer needed memory del table # Adjust dtypes, that cannot be handled by pyarrow @@ -938,13 +943,17 @@ def _load_csv(self, path: str): # As the MultiIndex does not preserve dtypes, # we need to set them manually. 
# - if len(index_columns) > 0: - index_dtypes = {column: df[column].dtype for column in index_columns} - print(f"{self.levels=}") - print(f"{index_dtypes=}") + # if len(index_columns) > 0: + # index_dtypes = {column: df[column].dtype for column in index_columns} + # dtypes = { + # level: to_pandas_dtype(dtype) + # for level, dtype in self.levels.items() + # } + # print(f"{self.levels=}") + # print(f"{index_dtypes=}") df.set_index(index_columns, inplace=True) - if len(index_columns) > 0: - df.index = utils.set_index_dtypes(df.index, index_dtypes) + # if len(index_columns) > 0: + # df.index = utils.set_index_dtypes(df.index, index_dtypes) self._df = df diff --git a/tests/test_misc_table.py b/tests/test_misc_table.py index 7d9bfa41..48da78f9 100644 --- a/tests/test_misc_table.py +++ b/tests/test_misc_table.py @@ -907,6 +907,7 @@ def test_dtype_multiindex( assert list(db["misc"].levels.values()) == expected_audformat_dtypes assert list(db["misc"].index.dtypes) == expected_pandas_dtypes + print(f"{db['misc'].index=}") db_root = tmpdir.join("db") db.save(db_root, storage_format="csv") db_new = audformat.Database.load(db_root) From e0b831ef12136b85177fa9a7cd719f23262dac46 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 30 May 2024 16:36:11 +0200 Subject: [PATCH 06/67] Fix tests --- audformat/core/table.py | 21 ++++++--------------- tests/test_misc_table.py | 8 +++++++- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index ab0a6655..3741d74d 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -862,7 +862,6 @@ def _load_csv(self, path: str): else: object_columns.append(name) - print(f"{dtypes=}") # --- Columns --- categories = {} columns = list(self.columns) @@ -897,14 +896,12 @@ def _load_csv(self, path: str): strings_can_be_null=True, ), ) - print(f"{table=}") df = table.to_pandas( deduplicate_objects=False, types_mapper={ pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) - print(f"{df=}") # Free no longer needed memory del table # Adjust dtypes, that cannot be handled by pyarrow @@ -940,20 +937,14 @@ def _load_csv(self, path: str): # # When assigning more than one column, # a MultiIndex is assigned. - # As the MultiIndex does not preserve dtypes, - # we need to set them manually. + # As the MultiIndex does not preserve pandas dtypes, + # we need to restore them manually. 
# - # if len(index_columns) > 0: - # index_dtypes = {column: df[column].dtype for column in index_columns} - # dtypes = { - # level: to_pandas_dtype(dtype) - # for level, dtype in self.levels.items() - # } - # print(f"{self.levels=}") - # print(f"{index_dtypes=}") + if len(index_columns) > 1: + index_dtypes = {column: df[column].dtype for column in index_columns} df.set_index(index_columns, inplace=True) - # if len(index_columns) > 0: - # df.index = utils.set_index_dtypes(df.index, index_dtypes) + if len(index_columns) > 1: + df.index = utils.set_index_dtypes(df.index, index_dtypes) self._df = df diff --git a/tests/test_misc_table.py b/tests/test_misc_table.py index 48da78f9..683ad4a6 100644 --- a/tests/test_misc_table.py +++ b/tests/test_misc_table.py @@ -511,6 +511,13 @@ def test_dtype_column( "index_object, index_values, index_dtype, " "expected_pandas_dtype, expected_audformat_dtype", [ + ( + pd.Index, + ["0"], + None, + "object", + audformat.define.DataType.OBJECT, + ), ( pd.Index, [], @@ -907,7 +914,6 @@ def test_dtype_multiindex( assert list(db["misc"].levels.values()) == expected_audformat_dtypes assert list(db["misc"].index.dtypes) == expected_pandas_dtypes - print(f"{db['misc'].index=}") db_root = tmpdir.join("db") db.save(db_root, storage_format="csv") db_new = audformat.Database.load(db_root) From f48a00b87c872b1077d4265e521201d594b6d839 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 10:54:54 +0200 Subject: [PATCH 07/67] Remove unneeded code --- audformat/core/table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 3741d74d..354ebe1e 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -902,8 +902,6 @@ def _load_csv(self, path: str): pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) - # Free no longer needed memory - del table # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: if len(df) == 0: From b548774c65dca434f840dcc61935493064edade1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 13:35:36 +0200 Subject: [PATCH 08/67] Improve code --- audformat/core/common.py | 3 ++ audformat/core/table.py | 81 +++++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/audformat/core/common.py b/audformat/core/common.py index e8733735..5888fea9 100644 --- a/audformat/core/common.py +++ b/audformat/core/common.py @@ -418,4 +418,7 @@ def to_pyarrow_dtype(dtype: str) -> typing.Optional[str]: elif dtype == define.DataType.STRING: return pa.string() elif dtype == define.DataType.TIME: + # A better fitting type would be `pa.duration("ns")`, + # but this is not yet supported + # when reading CSV files return pa.string() diff --git a/audformat/core/table.py b/audformat/core/table.py index 354ebe1e..3feb398b 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -807,9 +807,9 @@ def _load_csv(self, path: str): The loaded table is stored under ``self._df``. - Loading a CSV file with :func:`pd.read_csv()` is slower + Loading a CSV file with :func:`pandas.read_csv()` is slower than the method applied here. - We first load the CSV file as a :class:`pa.Table` + We first load the CSV file as a :class:`pyarrow.Table` and convert it to a dataframe afterwards. 
Args: @@ -820,47 +820,53 @@ def _load_csv(self, path: str): # === DTYPES === - # Collect dtypes + # Collect pyarrow dtypes # of the CSV file, # by inspecting the audformat schemes, # and the index # associated with the table. - dtypes = [] + # The dtypes are used to create + # the pyarrow.Schema + # used when reading the CSV file + pyarrow_dtypes = [] # Collect columns, - # that cannot directly be converted to pyarrow + # that cannot directly be converted + # from pyarrow to pandas timedelta_columns = [] boolean_columns = [] object_columns = [] integer_columns = [] + # Collect columns, + # belonging to the index + index_columns = [] + # --- Index --- if hasattr(self, "type"): + levels = {} # filewise or segmented table - index_columns = [] - name = define.IndexField.FILE - dtypes.append((name, to_pyarrow_dtype(define.DataType.STRING))) - index_columns.append(name) + levels[define.IndexField.FILE] = define.DataType.STRING if self.type == define.IndexType.SEGMENTED: - for name in [define.IndexField.START, define.IndexField.END]: - dtypes.append((name, to_pyarrow_dtype(define.DataType.TIME))) - index_columns.append(name) - timedelta_columns.append(name) + # segmented table + for level in [define.IndexField.START, define.IndexField.END]: + levels[level] = define.DataType.TIME else: # misc table - index_columns = list(self.levels.keys()) - for name, dtype in self.levels.items(): - _dtype = to_pyarrow_dtype(dtype) - if _dtype is not None: - dtypes.append((name, _dtype)) - if dtype == define.DataType.TIME: - timedelta_columns.append(name) - elif dtype == define.DataType.INTEGER: - integer_columns.append(name) - elif dtype == define.DataType.BOOL: - boolean_columns.append(name) - else: - object_columns.append(name) + levels = self.levels + index_columns += list(levels.keys()) + for name, dtype in levels.items(): + pyarrow_dtype = to_pyarrow_dtype(dtype) + if pyarrow_dtype is not None: + pyarrow_dtypes.append((name, pyarrow_dtype)) + if dtype == define.DataType.TIME: + timedelta_columns.append(name) + elif dtype == define.DataType.INTEGER: + integer_columns.append(name) + elif dtype == define.DataType.BOOL: + boolean_columns.append(name) + else: + object_columns.append(name) # --- Columns --- categories = {} @@ -870,9 +876,9 @@ def _load_csv(self, path: str): scheme = schemes[column.scheme_id] if scheme.labels is not None: categories[column_id] = scheme._labels_to_list() - dtype = to_pyarrow_dtype(scheme.dtype) - if dtype is not None: - dtypes.append((column_id, dtype)) + pyarrow_dtype = to_pyarrow_dtype(scheme.dtype) + if pyarrow_dtype is not None: + pyarrow_dtypes.append((column_id, pyarrow_dtype)) if scheme.dtype == define.DataType.TIME: timedelta_columns.append(column_id) elif scheme.dtype == define.DataType.BOOL: @@ -884,7 +890,7 @@ def _load_csv(self, path: str): else: object_columns.append(column_id) - schema = pa.schema(dtypes) + schema = pa.schema(pyarrow_dtypes) table = csv.read_csv( path, read_options=csv.ReadOptions( @@ -904,17 +910,8 @@ def _load_csv(self, path: str): ) # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: - if len(df) == 0: - # For an empty dataframe, map() will not set the correct dtype - df[column] = df[column].astype("timedelta64[ns]") - else: - df[column] = df[column].map( - # "coerce" will set errors to NaT, - # and catches the case where the input is already - lambda x: pd.to_timedelta(x, errors="coerce") - ) + df[column] = df[column].astype("timedelta64[ns]") for column in boolean_columns: - df[column] = df[column].map(lambda x: 
pd.NA if x is None else x) df[column] = df[column].astype("boolean") for column in object_columns: df[column] = df[column].astype("object") @@ -935,8 +932,8 @@ def _load_csv(self, path: str): # # When assigning more than one column, # a MultiIndex is assigned. - # As the MultiIndex does not preserve pandas dtypes, - # we need to restore them manually. + # Setting a MultiIndex does not always preserve pandas dtypes, + # so we need to set them manually. # if len(index_columns) > 1: index_dtypes = {column: df[column].dtype for column in index_columns} From abb07d9d81632f138b76c505a79e8b3845bf31d0 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 13:44:34 +0200 Subject: [PATCH 09/67] Fix test for older pandas versions --- audformat/core/table.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 3feb398b..19c662d6 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -818,7 +818,7 @@ def _load_csv(self, path: str): """ schemes = self.db.schemes - # === DTYPES === + # === Infer dtypes === # Collect pyarrow dtypes # of the CSV file, @@ -890,6 +890,7 @@ def _load_csv(self, path: str): else: object_columns.append(column_id) + # === Read CSV === schema = pa.schema(pyarrow_dtypes) table = csv.read_csv( path, @@ -908,8 +909,14 @@ def _load_csv(self, path: str): pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) + + # === Adjust dtypes === + # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: + # Older versions of pandas cannot convert None to timedelta + # df[column] = df[column].map(lambda x: pd.NA if x is None else x) + df[column] = df[column].fillna(pd.NA) df[column] = df[column].astype("timedelta64[ns]") for column in boolean_columns: df[column] = df[column].astype("boolean") @@ -928,8 +935,8 @@ def _load_csv(self, path: str): ) df[column] = df[column].astype(dtype) - # Set index - # + # === Set index === + # When assigning more than one column, # a MultiIndex is assigned. 
# Setting a MultiIndex does not always preserve pandas dtypes, From 48c9da580918c7a59042e0263f3543d7874cb8cf Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 13:45:01 +0200 Subject: [PATCH 10/67] Exclude benchmark folder from tests --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 1100e75d..3d497a66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ addopts = ''' --cov-report term-missing --cov-report xml --ignore=docs/ + --ignore=benchmarks/ ''' From e556c90509fc349156e3e123c28023f2382efad1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 13:58:28 +0200 Subject: [PATCH 11/67] Test other implementation --- audformat/core/table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 19c662d6..53ead674 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -915,8 +915,8 @@ def _load_csv(self, path: str): # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: # Older versions of pandas cannot convert None to timedelta - # df[column] = df[column].map(lambda x: pd.NA if x is None else x) - df[column] = df[column].fillna(pd.NA) + df[column] = df[column].map(lambda x: pd.NA if x is None else x) + # df[column] = df[column].fillna(pd.NA) df[column] = df[column].astype("timedelta64[ns]") for column in boolean_columns: df[column] = df[column].astype("boolean") From b07f1ac8b7b9555ba05fc4a23cea112cc7d88c32 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 31 May 2024 14:00:50 +0200 Subject: [PATCH 12/67] Remove support for Python 3.8 --- .github/workflows/test.yml | 2 -- audformat/core/table.py | 3 --- pyproject.toml | 5 ++--- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 021def1d..2060e9bd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,8 +15,6 @@ jobs: os: [ ubuntu-20.04, windows-latest, macOS-latest ] python-version: [ '3.10' ] include: - - os: ubuntu-latest - python-version: '3.8' - os: ubuntu-latest python-version: '3.9' - os: ubuntu-latest diff --git a/audformat/core/table.py b/audformat/core/table.py index 53ead674..fc196e55 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -914,9 +914,6 @@ def _load_csv(self, path: str): # Adjust dtypes, that cannot be handled by pyarrow for column in timedelta_columns: - # Older versions of pandas cannot convert None to timedelta - df[column] = df[column].map(lambda x: pd.NA if x is None else x) - # df[column] = df[column].fillna(pd.NA) df[column] = df[column].astype("timedelta64[ns]") for column in boolean_columns: df[column] = df[column].astype("boolean") diff --git a/pyproject.toml b/pyproject.toml index 3d497a66..14ad2128 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,13 +23,12 @@ classifiers = [ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.8' +requires-python = '>=3.9' dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', @@ -38,7 +37,7 @@ dependencies = [ 'oyaml', 'pyarrow', 'pyyaml >=5.4.1', - 'pandas >=1.4.1', + 'pandas >=2.1.0', ] # Get version dynamically from git # (needs 
setuptools_scm tools config below) From b1e0b69e9989680613e1563450b0c30515f3ba73 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 12:02:17 +0200 Subject: [PATCH 13/67] Store tables as PARQUET --- audformat/core/define.py | 3 + audformat/core/table.py | 179 +++++++++++++++++++++++++++++++++------ tests/test_database.py | 29 +++++-- tests/test_table.py | 126 +++++++++++++++++++++++---- 4 files changed, 292 insertions(+), 45 deletions(-) diff --git a/audformat/core/define.py b/audformat/core/define.py index 37cffa4c..addd9f79 100644 --- a/audformat/core/define.py +++ b/audformat/core/define.py @@ -337,6 +337,9 @@ class TableStorageFormat(DefineBase): CSV = "csv" """File extension for tables stored in CSV format.""" + PARQUET = "parquet" + """File extension for tables stored in PARQUET format.""" + PICKLE = "pkl" """File extension for tables stored in PKL format.""" diff --git a/audformat/core/table.py b/audformat/core/table.py index fc196e55..fe4e8636 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pyarrow.csv as csv +import pyarrow.parquet as parquet import audeer @@ -460,44 +461,60 @@ def load( """ path = audeer.path(path) - pkl_file = f"{path}.{define.TableStorageFormat.PICKLE}" csv_file = f"{path}.{define.TableStorageFormat.CSV}" + parquet_file = f"{path}.{define.TableStorageFormat.PARQUET}" + pkl_file = f"{path}.{define.TableStorageFormat.PICKLE}" - if not os.path.exists(pkl_file) and not os.path.exists(csv_file): + if ( + not os.path.exists(pkl_file) + and not os.path.exists(csv_file) + and not os.path.exists(parquet_file) + ): raise RuntimeError( - f"No file found for table with path '{path}.{{pkl|csv}}'" + f"No file found for table with path '{path}.{{csv|parquet|pkl}}'" ) - # Load from PKL if file exists and is newer then CSV file. - # If both are written by Database.save() this is the case + # Load from PKL if file exists + # and is newer than CSV or PARQUET file. + # If files are written by Database.save() + # this is always the case # as it stores first the PKL file pickled = False if os.path.exists(pkl_file): - if os.path.exists(csv_file) and os.path.getmtime( - csv_file - ) > os.path.getmtime(pkl_file): - raise RuntimeError( - f"The table CSV file '{csv_file}' is newer " - f"than the table PKL file '{pkl_file}'. " - "If you want to load from the CSV file, " - "please delete the PKL file. " - "If you want to load from the PKL file, " - "please delete the CSV file." - ) + for file in [parquet_file, csv_file]: + if os.path.exists(file) and os.path.getmtime(file) > os.path.getmtime( + pkl_file + ): + ext = audeer.file_extension(file).upper() + raise RuntimeError( + f"The table {ext} file '{file}' is newer " + f"than the table PKL file '{pkl_file}'. " + f"If you want to load from the {ext} file, " + "please delete the PKL file. " + "If you want to load from the PKL file, " + f"please delete the {ext} file." + ) pickled = True if pickled: try: self._load_pickled(pkl_file) except (AttributeError, ValueError, EOFError) as ex: - # if exception is raised (e.g. unsupported pickle protocol) - # try to load from CSV and save it again + # If exception is raised + # (e.g. 
unsupported pickle protocol) + # try to load from PARQUET or CSV + # and save it again # otherwise raise error - if os.path.exists(csv_file): + if os.path.exists(parquet_file): + self._load_parquet(parquet_file) + self._save_pickled(pkl_file) + elif os.path.exists(csv_file): self._load_csv(csv_file) self._save_pickled(pkl_file) else: raise ex + elif os.path.exists(parquet_file): + self._load_parquet(parquet_file) else: self._load_csv(csv_file) @@ -563,7 +580,7 @@ def save( self, path: str, *, - storage_format: str = define.TableStorageFormat.CSV, + storage_format: str = define.TableStorageFormat.PARQUET, update_other_formats: bool = True, ): r"""Save table data to disk. @@ -583,16 +600,24 @@ def save( path = audeer.path(path) define.TableStorageFormat._assert_has_attribute_value(storage_format) - pickle_file = path + f".{define.TableStorageFormat.PICKLE}" - csv_file = path + f".{define.TableStorageFormat.CSV}" + parquet_file = f"{path}.{define.TableStorageFormat.PARQUET}" + pickle_file = f"{path}.{define.TableStorageFormat.PICKLE}" + csv_file = f"{path}.{define.TableStorageFormat.CSV}" - # Make sure the CSV file is always written first + # Make sure the CSV|PARQUET file is always written first # as it is expected to be older by load() if storage_format == define.TableStorageFormat.PICKLE: - if update_other_formats and os.path.exists(csv_file): + if update_other_formats and os.path.exists(parquet_file): + self._save_parquet(parquet_file) + elif update_other_formats and os.path.exists(csv_file): self._save_csv(csv_file) self._save_pickled(pickle_file) + if storage_format == define.TableStorageFormat.PARQUET: + self._save_parquet(parquet_file) + if update_other_formats and os.path.exists(pickle_file): + self._save_pickled(pickle_file) + if storage_format == define.TableStorageFormat.CSV: self._save_csv(csv_file) if update_other_formats and os.path.exists(pickle_file): @@ -947,6 +972,97 @@ def _load_csv(self, path: str): self._df = df + def _load_parquet(self, path: str): + r"""Load table from PARQUET file. + + The loaded table is stored under ``self._df``. 
+ + Args: + path: path to table, including file extension + + """ + schemes = self.db.schemes + + # === Infer dtypes === + + # Collect columns, + # that cannot directly be converted + # from pyarrow to pandas + object_columns = [] + + # Collect columns, + # belonging to the index + index_columns = [] + + # --- Index --- + if hasattr(self, "type"): + levels = {} + # filewise or segmented table + levels[define.IndexField.FILE] = define.DataType.STRING + if self.type == define.IndexType.SEGMENTED: + # segmented table + for level in [define.IndexField.START, define.IndexField.END]: + levels[level] = define.DataType.TIME + else: + # misc table + levels = self.levels + index_columns += list(levels.keys()) + for name, dtype in levels.items(): + if dtype == define.DataType.OBJECT: + object_columns.append(name) + + # --- Columns --- + categories = {} + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = schemes[column.scheme_id] + if scheme.labels is not None: + categories[column_id] = scheme._labels_to_list() + if scheme.dtype == define.DataType.OBJECT: + object_columns.append(column_id) + else: + object_columns.append(column_id) + + # === Read CSV === + table = parquet.read_table(path) + df = table.to_pandas( + deduplicate_objects=False, + types_mapper={ + pa.string(): pd.StringDtype(), + }.get, # we have to provide a callable, not a dict + ) + + # === Adjust dtypes === + + # Adjust dtypes, that cannot be handled by pyarrow + for column in object_columns: + df[column] = df[column].astype("object") + df[column] = df[column].replace(pd.NA, None) + for column, labels in categories.items(): + if len(labels) > 0 and isinstance(labels[0], int): + # allow nullable + labels = pd.array(labels, dtype="int64") + dtype = pd.api.types.CategoricalDtype( + categories=labels, + ordered=False, + ) + df[column] = df[column].astype(dtype) + + # === Set index === + + # When assigning more than one column, + # a MultiIndex is assigned. + # Setting a MultiIndex does not always preserve pandas dtypes, + # so we need to set them manually. + # + if len(index_columns) > 1: + index_dtypes = {column: df[column].dtype for column in index_columns} + df.set_index(index_columns, inplace=True) + if len(index_columns) > 1: + df.index = utils.set_index_dtypes(df.index, index_dtypes) + + self._df = df + def _load_pickled(self, path: str): # Older versions of audformat used xz compression # which produced smaller files, @@ -976,10 +1092,23 @@ def _save_csv(self, path: str): # Load table before opening CSV file # to avoid creating a CSV file # that is newer than the PKL file - df = self.df + df = self.df # loads table with open(path, "w") as fp: df.to_csv(fp, encoding="utf-8") + def _save_parquet(self, path: str): + # Load table before opening PARQUET file + # to avoid creating a PARQUET file + # that is newer than the PKL file + df = self.df # loads table + table = pa.Table.from_pandas( + df.reset_index(), + preserve_index=False, + # TODO: check if faster when providing schema? 
+ # schema=self._schema, + ) + parquet.write_table(table, path) + def _save_pickled(self, path: str): self.df.to_pickle( path, diff --git a/tests/test_database.py b/tests/test_database.py index dee4e658..67dfa2cf 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -446,6 +446,12 @@ def test_map_files(num_workers): @pytest.mark.parametrize( "db, storage_format, load_data, num_workers", [ + ( + audformat.testing.create_db(minimal=True), + audformat.define.TableStorageFormat.PARQUET, + False, + 1, + ), ( audformat.testing.create_db(minimal=True), audformat.define.TableStorageFormat.CSV, @@ -458,6 +464,12 @@ def test_map_files(num_workers): False, 1, ), + ( + audformat.testing.create_db(), + audformat.define.TableStorageFormat.PARQUET, + False, + 4, + ), ( audformat.testing.create_db(), audformat.define.TableStorageFormat.CSV, @@ -479,6 +491,11 @@ def test_map_files(num_workers): ], ) def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): + all_formats = audformat.define.TableStorageFormat._attribute_values() + non_cache_formats = [ + ext for ext in all_formats if ext != audformat.define.TableStorageFormat.PICKLE + ] + assert db.root is None audformat.testing.create_attachment_files(db, tmpdir) db.save( @@ -490,7 +507,7 @@ def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): expected_formats = [storage_format] for table_id in db.tables: - for ext in audformat.define.TableStorageFormat._attribute_values(): + for ext in all_formats: table_file = os.path.join(tmpdir, f"db.{table_id}.{ext}") if ext in expected_formats: assert os.path.exists(table_file) @@ -498,7 +515,7 @@ def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): assert not os.path.exists(table_file) # Test update other formats - if storage_format == audformat.define.TableStorageFormat.CSV and db.tables: + if storage_format in non_cache_formats and db.tables: db2 = audformat.testing.create_db() assert db2.root is None db2.save( @@ -508,7 +525,7 @@ def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): ) assert db.root == tmpdir - # Load prefers PKL files over CSV files, + # Load prefers PKL files, # which means we are loading the second database here db_load = audformat.Database.load( tmpdir, @@ -621,14 +638,16 @@ def test_save_and_load(tmpdir, db, storage_format, load_data, num_workers): # Test missing table if db.tables: table_id = list(db.tables)[0] - for ext in audformat.define.TableStorageFormat._attribute_values(): + for ext in all_formats: table_file = os.path.join(tmpdir, f"db.{table_id}.{ext}") if os.path.exists(table_file): os.remove(table_file) # The replace part handles Windows paths table_path = table_file[:-4].replace("\\", "\\\\") - error_msg = r"No file found for table with path " rf"'{table_path}.{{pkl|csv}}'" + error_msg = ( + r"No file found for table with path " rf"'{table_path}.{{csv|parquet|pkl}}'" + ) with pytest.raises(RuntimeError, match=error_msg): db = audformat.Database.load( tmpdir, diff --git a/tests/test_table.py b/tests/test_table.py index 348e455c..d4ece3d9 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1,4 +1,5 @@ import os +import re import typing import numpy as np @@ -1118,22 +1119,25 @@ def test_load(tmpdir): with pytest.raises(EOFError): table_loaded.load(path_no_ext) - # repeat with CSV file as fall back - table.save( - path_no_ext, - storage_format=audformat.define.TableStorageFormat.CSV, - ) - with open(path_pkl, "wb"): - pass - table_loaded = audformat.Table() - 
table_loaded.columns = table.columns - table_loaded._db = table._db - table_loaded.load(path_no_ext) - pd.testing.assert_frame_equal(table.df, table_loaded.df) + # repeat with CSV|PARQUET file as fall back + for ext in [ + audformat.define.TableStorageFormat.CSV, + audformat.define.TableStorageFormat.PARQUET, + ]: + table.save(path_no_ext, storage_format=ext) + with open(path_pkl, "wb"): + pass + table_loaded = audformat.Table() + table_loaded.columns = table.columns + table_loaded._db = table._db + table_loaded.load(path_no_ext) + pd.testing.assert_frame_equal(table.df, table_loaded.df) - # check if pickle file was recovered from CSV - df = pd.read_pickle(path_pkl) - pd.testing.assert_frame_equal(table.df, df) + # check if pickle file was recovered + df = pd.read_pickle(path_pkl) + pd.testing.assert_frame_equal(table.df, df) + + os.remove(f"{path_no_ext}.{ext}") def test_load_old_pickle(tmpdir): @@ -1403,6 +1407,98 @@ def test_pick_index(table, index, expected): pd.testing.assert_index_equal(table.index, expected) +@pytest.mark.parametrize( + "storage_format", + [ + pytest.param( + "csv", + marks=pytest.mark.skip(reason="CSV does not support numpy arrays"), + ), + "parquet", + "pkl", + ], +) +def test_save_and_load(tmpdir, storage_format): + r"""Test saving and loading of a table. + + Ensures the table dataframe representation + is identical after saving and loading a table. + + Args: + tmpdir: tmpdir fixture + storage_format: storage format + the table should be written to disk. + This will also be used as file extension + + """ + db = audformat.testing.create_db() + + # Extend database with more table/scheme combinations + db.schemes["int-labels"] = audformat.Scheme( + dtype=audformat.define.DataType.INTEGER, + labels=[0, 1], + ) + db.schemes["object"] = audformat.Scheme(audformat.define.DataType.OBJECT) + index = pd.MultiIndex.from_arrays( + [[0, 1], ["a", "b"]], + names=["idx1", "idx2"], + ) + index = audformat.utils.set_index_dtypes( + index, + { + "idx1": audformat.define.DataType.INTEGER, + "idx2": audformat.define.DataType.OBJECT, + }, + ) + db["multi-misc"] = audformat.MiscTable(index) + db["multi-misc"]["int"] = audformat.Column(scheme_id="int-labels") + db["multi-misc"]["int"].set([0, pd.NA]) + db["multi-misc"]["bool"] = audformat.Column(scheme_id="bool") + db["multi-misc"]["bool"].set([True, pd.NA]) + db["multi-misc"]["arrays"] = audformat.Column(scheme_id="object") + db["multi-misc"]["arrays"].set([np.array([0, 1]), np.array([2, 3])]) + db["multi-misc"]["lists"] = audformat.Column(scheme_id="object") + db["multi-misc"]["lists"].set([[0, 1], [2, 3]]) + + for table_id in list(db): + expected_df = db[table_id].get() + path_wo_ext = audeer.path(tmpdir, table_id) + path = f"{path_wo_ext}.{storage_format}" + db[table_id].save(path_wo_ext, storage_format=storage_format) + assert os.path.exists(path) + db[table_id].load(path_wo_ext) + pd.testing.assert_frame_equal(db[table_id].df, expected_df) + + +@pytest.mark.parametrize( + "storage_format, expected_error, expected_error_msg", + [ + ( + "non-existing", + audformat.errors.BadValueError, + re.escape( + "Bad value 'non-existing', expected one of ['csv', 'parquet', 'pkl']" + ), + ), + ], +) +def test_save_errors(tmpdir, storage_format, expected_error, expected_error_msg): + r"""Test errors when saving a table. + + Args: + tmpdir: tmpdir fixture + storage_format: storage format of table + expected_error: expected error, e.g. 
``ValueError`` + expected_error_msg: expected test of error message + + """ + db = audformat.testing.create_db() + table_id = list(db)[0] + path_wo_ext = audeer.path(tmpdir, table_id) + with pytest.raises(expected_error, match=expected_error_msg): + db[table_id].save(path_wo_ext, storage_format=storage_format) + + @pytest.mark.parametrize( "num_files,num_segments_per_file,values", [ From 68c764cc8f84ceb7aaa829bd81e7f4e94856f110 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 14:37:41 +0200 Subject: [PATCH 14/67] Cleanup code + Table.levels --- audformat/core/database.py | 1 + audformat/core/table.py | 361 ++++++++++++++++++------------------- 2 files changed, 174 insertions(+), 188 deletions(-) diff --git a/audformat/core/database.py b/audformat/core/database.py index 5eb72e68..8c26bedf 100644 --- a/audformat/core/database.py +++ b/audformat/core/database.py @@ -121,6 +121,7 @@ class Database(HeaderBase): tables: table: type: filewise + levels: {file: str} media_id: audio columns: column: {scheme_id: emotion, rater_id: rater} diff --git a/audformat/core/table.py b/audformat/core/table.py index fe4e8636..a3091494 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -819,6 +819,102 @@ def assert_equal( return self + def _convert_pyarrow_dtypes( + self, + df: pd.DataFrame, + *, + convert_all: bool = False, + ) -> pd.DataFrame: + r"""Convert dtypes that are not handled by pyarrow. + + This adjusts dtypes in a dataframe, + that could not be set correctly + when converting to the dataframe + from pyarrow. + + Args: + df: dataframe, + convert_all: if ``False``, + converts all columns with + ``"object"`` audformat dtype, + and all columns with a scheme with labels. + If ``"True"``, + it converts additionally all columns with + ``"bool"``, ``"int"``, and ``"time"`` audformat dtypes + + Returns: + dataframe with converted dtypes + + """ + # Collect columns with dtypes, + # that cannot directly be converted + # from pyarrow to pandas + bool_columns = [] + int_columns = [] + time_columns = [] + object_columns = [] + + # Collect columns + # with scheme labels + labeled_columns = [] + + # Collect columns, + # belonging to the index + index_columns = [] + + # --- Index --- + index_columns += list(self.levels.keys()) + for level, dtype in self.levels.items(): + if dtype == define.DataType.BOOL: + bool_columns.append(level) + elif dtype == define.DataType.INTEGER: + int_columns.append(level) + elif dtype == define.DataType.TIME: + time_columns.append(level) + elif dtype == define.DataType.OBJECT: + object_columns.append(level) + + # --- Columns --- + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = self.db.schemes[column.scheme_id] + if scheme.labels is not None: + labeled_columns.append(column_id) + elif scheme.dtype == define.DataType.BOOL: + bool_columns.append(column_id) + elif scheme.dtype == define.DataType.INTEGER: + int_columns.append(column_id) + elif scheme.dtype == define.DataType.TIME: + time_columns.append(column_id) + elif scheme.dtype == define.DataType.OBJECT: + object_columns.append(column_id) + else: + # No scheme defaults to `object` dtype + object_columns.append(column_id) + + if convert_all: + for column in bool_columns: + df[column] = df[column].astype("boolean") + for column in int_columns: + df[column] = df[column].astype("Int64") + for column in time_columns: + df[column] = df[column].astype("timedelta64[ns]") + for column in object_columns: + df[column] = df[column].astype("object") + df[column] = 
df[column].replace(pd.NA, None) + for column in labeled_columns: + scheme = self.db.schemes[self.columns[column].scheme_id] + labels = scheme._labels_to_list() + if len(labels) > 0 and isinstance(labels[0], int): + # allow nullable + labels = pd.array(labels, dtype="int64") + dtype = pd.api.types.CategoricalDtype( + categories=labels, + ordered=False, + ) + df[column] = df[column].astype(dtype) + return df + def _get_by_index( self, index: pd.Index, @@ -841,134 +937,43 @@ def _load_csv(self, path: str): path: path to table, including file extension """ - schemes = self.db.schemes - - # === Infer dtypes === - # Collect pyarrow dtypes - # of the CSV file, - # by inspecting the audformat schemes, - # and the index - # associated with the table. - # The dtypes are used to create + # of all columns, + # including index columns. + # The dtypes are stored as a tuple + # ``(column, dtype)``, + # and are used to create # the pyarrow.Schema # used when reading the CSV file pyarrow_dtypes = [] - - # Collect columns, - # that cannot directly be converted - # from pyarrow to pandas - timedelta_columns = [] - boolean_columns = [] - object_columns = [] - integer_columns = [] - - # Collect columns, - # belonging to the index - index_columns = [] - - # --- Index --- - if hasattr(self, "type"): - levels = {} - # filewise or segmented table - levels[define.IndexField.FILE] = define.DataType.STRING - if self.type == define.IndexType.SEGMENTED: - # segmented table - for level in [define.IndexField.START, define.IndexField.END]: - levels[level] = define.DataType.TIME - else: - # misc table - levels = self.levels - index_columns += list(levels.keys()) - for name, dtype in levels.items(): - pyarrow_dtype = to_pyarrow_dtype(dtype) - if pyarrow_dtype is not None: - pyarrow_dtypes.append((name, pyarrow_dtype)) - if dtype == define.DataType.TIME: - timedelta_columns.append(name) - elif dtype == define.DataType.INTEGER: - integer_columns.append(name) - elif dtype == define.DataType.BOOL: - boolean_columns.append(name) - else: - object_columns.append(name) - - # --- Columns --- - categories = {} - columns = list(self.columns) + # Index + for level, dtype in self.levels.items(): + if dtype != define.DataType.OBJECT: + pyarrow_dtypes.append((level, to_pyarrow_dtype(dtype))) + # Columns for column_id, column in self.columns.items(): if column.scheme_id is not None: - scheme = schemes[column.scheme_id] - if scheme.labels is not None: - categories[column_id] = scheme._labels_to_list() - pyarrow_dtype = to_pyarrow_dtype(scheme.dtype) - if pyarrow_dtype is not None: - pyarrow_dtypes.append((column_id, pyarrow_dtype)) - if scheme.dtype == define.DataType.TIME: - timedelta_columns.append(column_id) - elif scheme.dtype == define.DataType.BOOL: - boolean_columns.append(column_id) - elif scheme.dtype == define.DataType.INTEGER: - integer_columns.append(column_id) - else: - object_columns.append(column_id) - else: - object_columns.append(column_id) + dtype = self.db.schemes[column.scheme_id].dtype + if dtype != define.DataType.OBJECT: + pyarrow_dtypes.append((column_id, to_pyarrow_dtype(dtype))) - # === Read CSV === - schema = pa.schema(pyarrow_dtypes) + # Read CSV file table = csv.read_csv( path, read_options=csv.ReadOptions( - column_names=index_columns + columns, + column_names=list(self.levels.keys()) + list(self.columns.keys()), skip_rows=1, ), convert_options=csv.ConvertOptions( - column_types=schema, + column_types=pa.schema(pyarrow_dtypes), strings_can_be_null=True, ), ) - df = table.to_pandas( - deduplicate_objects=False, 
- types_mapper={ - pa.string(): pd.StringDtype(), - }.get, # we have to provide a callable, not a dict - ) - - # === Adjust dtypes === - - # Adjust dtypes, that cannot be handled by pyarrow - for column in timedelta_columns: - df[column] = df[column].astype("timedelta64[ns]") - for column in boolean_columns: - df[column] = df[column].astype("boolean") - for column in object_columns: - df[column] = df[column].astype("object") - df[column] = df[column].replace(pd.NA, None) - for column in integer_columns: - df[column] = df[column].astype("Int64") - for column, labels in categories.items(): - if len(labels) > 0 and isinstance(labels[0], int): - # allow nullable - labels = pd.array(labels, dtype="int64") - dtype = pd.api.types.CategoricalDtype( - categories=labels, - ordered=False, - ) - df[column] = df[column].astype(dtype) + df = self._pyarrow_table_to_dataframe(table) - # === Set index === - - # When assigning more than one column, - # a MultiIndex is assigned. - # Setting a MultiIndex does not always preserve pandas dtypes, - # so we need to set them manually. - # - if len(index_columns) > 1: - index_dtypes = {column: df[column].dtype for column in index_columns} - df.set_index(index_columns, inplace=True) - if len(index_columns) > 1: - df.index = utils.set_index_dtypes(df.index, index_dtypes) + # Adjust dtypes and set index + df = self._convert_pyarrow_dtypes(df, convert_all=True) + df = self._set_index(df, list(self.levels.keys())) self._df = df @@ -981,85 +986,13 @@ def _load_parquet(self, path: str): path: path to table, including file extension """ - schemes = self.db.schemes - - # === Infer dtypes === - - # Collect columns, - # that cannot directly be converted - # from pyarrow to pandas - object_columns = [] - - # Collect columns, - # belonging to the index - index_columns = [] - - # --- Index --- - if hasattr(self, "type"): - levels = {} - # filewise or segmented table - levels[define.IndexField.FILE] = define.DataType.STRING - if self.type == define.IndexType.SEGMENTED: - # segmented table - for level in [define.IndexField.START, define.IndexField.END]: - levels[level] = define.DataType.TIME - else: - # misc table - levels = self.levels - index_columns += list(levels.keys()) - for name, dtype in levels.items(): - if dtype == define.DataType.OBJECT: - object_columns.append(name) - - # --- Columns --- - categories = {} - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - scheme = schemes[column.scheme_id] - if scheme.labels is not None: - categories[column_id] = scheme._labels_to_list() - if scheme.dtype == define.DataType.OBJECT: - object_columns.append(column_id) - else: - object_columns.append(column_id) - - # === Read CSV === + # Read PARQUET file table = parquet.read_table(path) - df = table.to_pandas( - deduplicate_objects=False, - types_mapper={ - pa.string(): pd.StringDtype(), - }.get, # we have to provide a callable, not a dict - ) - - # === Adjust dtypes === - - # Adjust dtypes, that cannot be handled by pyarrow - for column in object_columns: - df[column] = df[column].astype("object") - df[column] = df[column].replace(pd.NA, None) - for column, labels in categories.items(): - if len(labels) > 0 and isinstance(labels[0], int): - # allow nullable - labels = pd.array(labels, dtype="int64") - dtype = pd.api.types.CategoricalDtype( - categories=labels, - ordered=False, - ) - df[column] = df[column].astype(dtype) - - # === Set index === + df = self._pyarrow_table_to_dataframe(table) - # When assigning more than one column, - # a MultiIndex is 
assigned. - # Setting a MultiIndex does not always preserve pandas dtypes, - # so we need to set them manually. - # - if len(index_columns) > 1: - index_dtypes = {column: df[column].dtype for column in index_columns} - df.set_index(index_columns, inplace=True) - if len(index_columns) > 1: - df.index = utils.set_index_dtypes(df.index, index_dtypes) + # Adjust dtypes and set index + df = self._convert_pyarrow_dtypes(df) + df = self._set_index(df, list(self.levels.keys())) self._df = df @@ -1088,6 +1021,23 @@ def _load_pickled(self, path: str): self._df = df + def _pyarrow_table_to_dataframe(self, table: pa.Table) -> pd.DataFrame: + r"""Convert pyarrow table to pandas dataframe. + + Args: + table: pyarrow table + + Returns: + dataframe + + """ + return table.to_pandas( + deduplicate_objects=False, + types_mapper={ + pa.string(): pd.StringDtype(), + }.get, # we have to provide a callable, not a dict + ) + def _save_csv(self, path: str): # Load table before opening CSV file # to avoid creating a CSV file @@ -1149,6 +1099,31 @@ def _set_column(self, column_id: str, column: Column) -> Column: return column + def _set_index(self, df: pd.DataFrame, columns: typing.Sequence) -> pd.DataFrame: + r"""Set columns as index. + + Setting of index columns is performed inplace! + + Args: + df: dataframe + columns: columns to be set as index of dataframe + + Returns: + updated dataframe + + """ + # When assigning more than one column, + # a MultiIndex is assigned. + # Setting a MultiIndex does not always preserve pandas dtypes, + # so we need to set them manually. + # + if len(columns) > 1: + dtypes = {column: df[column].dtype for column in columns} + df.set_index(columns, inplace=True) + if len(columns) > 1: + df.index = utils.set_index_dtypes(df.index, dtypes) + return df + class MiscTable(Base): r"""Miscellaneous table. @@ -1348,6 +1323,7 @@ class Table(Base): >>> table["values"] = Column() >>> table type: filewise + levels: {file: str} split_id: test columns: values: {} @@ -1439,6 +1415,15 @@ def __init__( """ + levels = {} + levels[define.IndexField.FILE] = define.DataType.STRING + if self.type == define.IndexType.SEGMENTED: + levels[define.IndexField.START] = define.DataType.TIME + levels[define.IndexField.END] = define.DataType.TIME + + self.levels = levels + r"""Index levels.""" + super().__init__( index, split_id=split_id, From fdc96bdb11e00692fa8ea0bd68a43f1c95bd5b61 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 14:44:34 +0200 Subject: [PATCH 15/67] Use dict for CSV dtype mappings --- audformat/core/common.py | 34 ---------------------------------- audformat/core/table.py | 21 ++++++++++++++++----- 2 files changed, 16 insertions(+), 39 deletions(-) diff --git a/audformat/core/common.py b/audformat/core/common.py index 5888fea9..833b44fc 100644 --- a/audformat/core/common.py +++ b/audformat/core/common.py @@ -6,7 +6,6 @@ import oyaml as yaml import pandas as pd -import pyarrow as pa from audformat import define from audformat.core.errors import BadKeyError @@ -389,36 +388,3 @@ def to_pandas_dtype(dtype: str) -> typing.Optional[str]: return "string" elif dtype == define.DataType.TIME: return "timedelta64[ns]" - - -def to_pyarrow_dtype(dtype: str) -> typing.Optional[str]: - r"""Convert audformat to pyarrow dtype. - - For ``"object"`` as ``dtype`` - there is no equivalent, - and we don't return a value here. - We let ``pyarrow`` decide, - which dtype fits best in that case. 
- - Args: - dtype: audformat dtype - - Returns: - pyarrow dtype - - """ - if dtype == define.DataType.BOOL: - return pa.bool_() - elif dtype == define.DataType.DATE: - return pa.timestamp("ns") - elif dtype == define.DataType.FLOAT: - return pa.float64() - elif dtype == define.DataType.INTEGER: - return pa.int64() - elif dtype == define.DataType.STRING: - return pa.string() - elif dtype == define.DataType.TIME: - # A better fitting type would be `pa.duration("ns")`, - # but this is not yet supported - # when reading CSV files - return pa.string() diff --git a/audformat/core/table.py b/audformat/core/table.py index a3091494..125eb921 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -18,7 +18,6 @@ from audformat.core.common import HeaderBase from audformat.core.common import HeaderDict from audformat.core.common import to_audformat_dtype -from audformat.core.common import to_pyarrow_dtype from audformat.core.errors import BadIdError from audformat.core.index import filewise_index from audformat.core.index import index_type @@ -946,16 +945,28 @@ def _load_csv(self, path: str): # the pyarrow.Schema # used when reading the CSV file pyarrow_dtypes = [] + # Mapping from audformat to pyarrow dtypes + to_pyarrow_dtype = { + define.DataType.BOOL: pa.bool_(), + define.DataType.DATE: pa.timestamp("ns"), + define.DataType.FLOAT: pa.float64(), + define.DataType.INTEGER: pa.int64(), + define.DataType.STRING: pa.string(), + # A better fitting type would be `pa.duration("ns")`, + # but this is not yet supported + # when reading CSV files + define.DataType.TIME: pa.string(), + } # Index for level, dtype in self.levels.items(): - if dtype != define.DataType.OBJECT: - pyarrow_dtypes.append((level, to_pyarrow_dtype(dtype))) + if dtype in to_pyarrow_dtype: + pyarrow_dtypes.append((level, to_pyarrow_dtype[dtype])) # Columns for column_id, column in self.columns.items(): if column.scheme_id is not None: dtype = self.db.schemes[column.scheme_id].dtype - if dtype != define.DataType.OBJECT: - pyarrow_dtypes.append((column_id, to_pyarrow_dtype(dtype))) + if dtype in to_pyarrow_dtype: + pyarrow_dtypes.append((column_id, to_pyarrow_dtype[dtype])) # Read CSV file table = csv.read_csv( From e865813c7bef449b90022985f0d948da60a40dad Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:19:15 +0200 Subject: [PATCH 16/67] Rename helper function --- audformat/core/table.py | 196 ++++++++++++++++++++-------------------- 1 file changed, 98 insertions(+), 98 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 125eb921..610cd0a3 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -818,102 +818,6 @@ def assert_equal( return self - def _convert_pyarrow_dtypes( - self, - df: pd.DataFrame, - *, - convert_all: bool = False, - ) -> pd.DataFrame: - r"""Convert dtypes that are not handled by pyarrow. - - This adjusts dtypes in a dataframe, - that could not be set correctly - when converting to the dataframe - from pyarrow. - - Args: - df: dataframe, - convert_all: if ``False``, - converts all columns with - ``"object"`` audformat dtype, - and all columns with a scheme with labels. 
- If ``"True"``, - it converts additionally all columns with - ``"bool"``, ``"int"``, and ``"time"`` audformat dtypes - - Returns: - dataframe with converted dtypes - - """ - # Collect columns with dtypes, - # that cannot directly be converted - # from pyarrow to pandas - bool_columns = [] - int_columns = [] - time_columns = [] - object_columns = [] - - # Collect columns - # with scheme labels - labeled_columns = [] - - # Collect columns, - # belonging to the index - index_columns = [] - - # --- Index --- - index_columns += list(self.levels.keys()) - for level, dtype in self.levels.items(): - if dtype == define.DataType.BOOL: - bool_columns.append(level) - elif dtype == define.DataType.INTEGER: - int_columns.append(level) - elif dtype == define.DataType.TIME: - time_columns.append(level) - elif dtype == define.DataType.OBJECT: - object_columns.append(level) - - # --- Columns --- - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - scheme = self.db.schemes[column.scheme_id] - if scheme.labels is not None: - labeled_columns.append(column_id) - elif scheme.dtype == define.DataType.BOOL: - bool_columns.append(column_id) - elif scheme.dtype == define.DataType.INTEGER: - int_columns.append(column_id) - elif scheme.dtype == define.DataType.TIME: - time_columns.append(column_id) - elif scheme.dtype == define.DataType.OBJECT: - object_columns.append(column_id) - else: - # No scheme defaults to `object` dtype - object_columns.append(column_id) - - if convert_all: - for column in bool_columns: - df[column] = df[column].astype("boolean") - for column in int_columns: - df[column] = df[column].astype("Int64") - for column in time_columns: - df[column] = df[column].astype("timedelta64[ns]") - for column in object_columns: - df[column] = df[column].astype("object") - df[column] = df[column].replace(pd.NA, None) - for column in labeled_columns: - scheme = self.db.schemes[self.columns[column].scheme_id] - labels = scheme._labels_to_list() - if len(labels) > 0 and isinstance(labels[0], int): - # allow nullable - labels = pd.array(labels, dtype="int64") - dtype = pd.api.types.CategoricalDtype( - categories=labels, - ordered=False, - ) - df[column] = df[column].astype(dtype) - return df - def _get_by_index( self, index: pd.Index, @@ -983,7 +887,7 @@ def _load_csv(self, path: str): df = self._pyarrow_table_to_dataframe(table) # Adjust dtypes and set index - df = self._convert_pyarrow_dtypes(df, convert_all=True) + df = self._pyarrow_convert_dtypes(df, convert_all=True) df = self._set_index(df, list(self.levels.keys())) self._df = df @@ -1002,7 +906,7 @@ def _load_parquet(self, path: str): df = self._pyarrow_table_to_dataframe(table) # Adjust dtypes and set index - df = self._convert_pyarrow_dtypes(df) + df = self._pyarrow_convert_dtypes(df) df = self._set_index(df, list(self.levels.keys())) self._df = df @@ -1032,6 +936,102 @@ def _load_pickled(self, path: str): self._df = df + def _pyarrow_convert_dtypes( + self, + df: pd.DataFrame, + *, + convert_all: bool = False, + ) -> pd.DataFrame: + r"""Convert dtypes that are not handled by pyarrow. + + This adjusts dtypes in a dataframe, + that could not be set correctly + when converting to the dataframe + from pyarrow. + + Args: + df: dataframe, + convert_all: if ``False``, + converts all columns with + ``"object"`` audformat dtype, + and all columns with a scheme with labels. 
+ If ``"True"``, + it converts additionally all columns with + ``"bool"``, ``"int"``, and ``"time"`` audformat dtypes + + Returns: + dataframe with converted dtypes + + """ + # Collect columns with dtypes, + # that cannot directly be converted + # from pyarrow to pandas + bool_columns = [] + int_columns = [] + time_columns = [] + object_columns = [] + + # Collect columns + # with scheme labels + labeled_columns = [] + + # Collect columns, + # belonging to the index + index_columns = [] + + # --- Index --- + index_columns += list(self.levels.keys()) + for level, dtype in self.levels.items(): + if dtype == define.DataType.BOOL: + bool_columns.append(level) + elif dtype == define.DataType.INTEGER: + int_columns.append(level) + elif dtype == define.DataType.TIME: + time_columns.append(level) + elif dtype == define.DataType.OBJECT: + object_columns.append(level) + + # --- Columns --- + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = self.db.schemes[column.scheme_id] + if scheme.labels is not None: + labeled_columns.append(column_id) + elif scheme.dtype == define.DataType.BOOL: + bool_columns.append(column_id) + elif scheme.dtype == define.DataType.INTEGER: + int_columns.append(column_id) + elif scheme.dtype == define.DataType.TIME: + time_columns.append(column_id) + elif scheme.dtype == define.DataType.OBJECT: + object_columns.append(column_id) + else: + # No scheme defaults to `object` dtype + object_columns.append(column_id) + + if convert_all: + for column in bool_columns: + df[column] = df[column].astype("boolean") + for column in int_columns: + df[column] = df[column].astype("Int64") + for column in time_columns: + df[column] = df[column].astype("timedelta64[ns]") + for column in object_columns: + df[column] = df[column].astype("object") + df[column] = df[column].replace(pd.NA, None) + for column in labeled_columns: + scheme = self.db.schemes[self.columns[column].scheme_id] + labels = scheme._labels_to_list() + if len(labels) > 0 and isinstance(labels[0], int): + # allow nullable + labels = pd.array(labels, dtype="int64") + dtype = pd.api.types.CategoricalDtype( + categories=labels, + ordered=False, + ) + df[column] = df[column].astype(dtype) + return df + def _pyarrow_table_to_dataframe(self, table: pa.Table) -> pd.DataFrame: r"""Convert pyarrow table to pandas dataframe. 
From eee02d3721fb51fb4ba6560fa174d89f471b01a0 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:21:59 +0200 Subject: [PATCH 17/67] Simplify code --- audformat/core/table.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 610cd0a3..b74ec5d3 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -884,11 +884,7 @@ def _load_csv(self, path: str): strings_can_be_null=True, ), ) - df = self._pyarrow_table_to_dataframe(table) - - # Adjust dtypes and set index - df = self._pyarrow_convert_dtypes(df, convert_all=True) - df = self._set_index(df, list(self.levels.keys())) + df = self._pyarrow_table_to_dataframe(table, from_csv=True) self._df = df @@ -905,10 +901,6 @@ def _load_parquet(self, path: str): table = parquet.read_table(path) df = self._pyarrow_table_to_dataframe(table) - # Adjust dtypes and set index - df = self._pyarrow_convert_dtypes(df) - df = self._set_index(df, list(self.levels.keys())) - self._df = df def _load_pickled(self, path: str): @@ -1032,22 +1024,34 @@ def _pyarrow_convert_dtypes( df[column] = df[column].astype(dtype) return df - def _pyarrow_table_to_dataframe(self, table: pa.Table) -> pd.DataFrame: + def _pyarrow_table_to_dataframe( + self, + table: pa.Table, + *, + from_csv: bool = False, + ) -> pd.DataFrame: r"""Convert pyarrow table to pandas dataframe. Args: table: pyarrow table + from_csv: if ``True`` it assumes, + that ``table`` was created by reading a CSV file, + and it will convert all needed dtypes Returns: dataframe """ - return table.to_pandas( + df = table.to_pandas( deduplicate_objects=False, types_mapper={ pa.string(): pd.StringDtype(), }.get, # we have to provide a callable, not a dict ) + # Adjust dtypes and set index + df = self._pyarrow_convert_dtypes(df, convert_all=from_csv) + df = self._set_index(df, list(self.levels.keys())) + return df def _save_csv(self, path: str): # Load table before opening CSV file From cb4a42fd04168a59e9d4036f198d32918f507638 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:27:57 +0200 Subject: [PATCH 18/67] Add helper function for CSV schema --- audformat/core/table.py | 84 ++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index b74ec5d3..10927505 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -840,39 +840,6 @@ def _load_csv(self, path: str): path: path to table, including file extension """ - # Collect pyarrow dtypes - # of all columns, - # including index columns. 
- # The dtypes are stored as a tuple - # ``(column, dtype)``, - # and are used to create - # the pyarrow.Schema - # used when reading the CSV file - pyarrow_dtypes = [] - # Mapping from audformat to pyarrow dtypes - to_pyarrow_dtype = { - define.DataType.BOOL: pa.bool_(), - define.DataType.DATE: pa.timestamp("ns"), - define.DataType.FLOAT: pa.float64(), - define.DataType.INTEGER: pa.int64(), - define.DataType.STRING: pa.string(), - # A better fitting type would be `pa.duration("ns")`, - # but this is not yet supported - # when reading CSV files - define.DataType.TIME: pa.string(), - } - # Index - for level, dtype in self.levels.items(): - if dtype in to_pyarrow_dtype: - pyarrow_dtypes.append((level, to_pyarrow_dtype[dtype])) - # Columns - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - dtype = self.db.schemes[column.scheme_id].dtype - if dtype in to_pyarrow_dtype: - pyarrow_dtypes.append((column_id, to_pyarrow_dtype[dtype])) - - # Read CSV file table = csv.read_csv( path, read_options=csv.ReadOptions( @@ -880,7 +847,7 @@ def _load_csv(self, path: str): skip_rows=1, ), convert_options=csv.ConvertOptions( - column_types=pa.schema(pyarrow_dtypes), + column_types=self._pyarrow_csv_schema(), strings_can_be_null=True, ), ) @@ -1024,6 +991,55 @@ def _pyarrow_convert_dtypes( df[column] = df[column].astype(dtype) return df + def _pyarrow_csv_schema(self) -> pa.Schema: + r"""Data type mapping for reading CSV file with pyarrow. + + This provides a schema, + defining pyarrow dtypes + for the columns of a CSV file. + + The dtypes are extracted from the audformat schemes, + and converted to the pyarrow dtypes. + + Returns: + pyarrow schema for reading a CSV file + + """ + # Mapping from audformat to pyarrow dtypes + to_pyarrow_dtype = { + define.DataType.BOOL: pa.bool_(), + define.DataType.DATE: pa.timestamp("ns"), + define.DataType.FLOAT: pa.float64(), + define.DataType.INTEGER: pa.int64(), + define.DataType.STRING: pa.string(), + # A better fitting type would be `pa.duration("ns")`, + # but this is not yet supported + # when reading CSV files + define.DataType.TIME: pa.string(), + } + + # Collect pyarrow dtypes + # of all columns, + # including index columns. 
+ # The dtypes are stored as a tuple + # ``(column, dtype)``, + # and are used to create + # the pyarrow.Schema + # used when reading the CSV file + pyarrow_dtypes = [] + # Index + for level, dtype in self.levels.items(): + if dtype in to_pyarrow_dtype: + pyarrow_dtypes.append((level, to_pyarrow_dtype[dtype])) + # Columns + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + dtype = self.db.schemes[column.scheme_id].dtype + if dtype in to_pyarrow_dtype: + pyarrow_dtypes.append((column_id, to_pyarrow_dtype[dtype])) + + return pa.schema(pyarrow_dtypes) + def _pyarrow_table_to_dataframe( self, table: pa.Table, From c89bc33bfd0a9fec895e211c4524f73df96bb44f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:02:22 +0200 Subject: [PATCH 19/67] Fix typo in docstring --- audformat/core/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audformat/core/utils.py b/audformat/core/utils.py index 4e5d6015..19b33a74 100644 --- a/audformat/core/utils.py +++ b/audformat/core/utils.py @@ -2052,7 +2052,7 @@ def _is_same_dtype(d1, d2) -> bool: def _levels(obj): - r"""List of dtypes of object.""" + r"""List of levels of object.""" if isinstance(obj, pd.MultiIndex): return list(obj.names) else: From e485d573dadf63e5bd71103e409e958bd3ad8bd1 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:06:35 +0200 Subject: [PATCH 20/67] Remove levels attribute --- audformat/core/table.py | 60 +++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 10927505..4747bd6a 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -826,6 +826,19 @@ def _get_by_index( # Returns `df, df_is_copy` raise NotImplementedError() + def _levels_and_dtypes(self) -> typing.Dict[str, str]: + r"""Levels and dtypes of index columns. + + Returns: + dictionary with index levels (column names) + and associated audformat data type + + """ + # The returned dictionary is used + # to infer index column names and dtypes + # when reading CSV files. + raise NotImplementedError() # pragma: no cover + def _load_csv(self, path: str): r"""Load table from CSV file. 
@@ -840,10 +853,12 @@ def _load_csv(self, path: str): path: path to table, including file extension """ + levels = list(self._levels_and_dtypes().keys()) + columns = list(self.columns.keys()) table = csv.read_csv( path, read_options=csv.ReadOptions( - column_names=list(self.levels.keys()) + list(self.columns.keys()), + column_names=levels + columns, skip_rows=1, ), convert_options=csv.ConvertOptions( @@ -939,8 +954,8 @@ def _pyarrow_convert_dtypes( index_columns = [] # --- Index --- - index_columns += list(self.levels.keys()) - for level, dtype in self.levels.items(): + index_columns += list(self._levels_and_dtypes.keys()) + for level, dtype in self._levels_and_dtypes.items(): if dtype == define.DataType.BOOL: bool_columns.append(level) elif dtype == define.DataType.INTEGER: @@ -1028,7 +1043,7 @@ def _pyarrow_csv_schema(self) -> pa.Schema: # used when reading the CSV file pyarrow_dtypes = [] # Index - for level, dtype in self.levels.items(): + for level, dtype in self._levels_and_dtypes.items(): if dtype in to_pyarrow_dtype: pyarrow_dtypes.append((level, to_pyarrow_dtype[dtype])) # Columns @@ -1066,7 +1081,8 @@ def _pyarrow_table_to_dataframe( ) # Adjust dtypes and set index df = self._pyarrow_convert_dtypes(df, convert_all=from_csv) - df = self._set_index(df, list(self.levels.keys())) + index_columns = list(self._levels_and_dtypes.keys()) + df = self._set_index(df, index_columns) return df def _save_csv(self, path: str): @@ -1315,6 +1331,16 @@ def __init__( def _get_by_index(self, index: pd.Index) -> pd.DataFrame: return self.df.loc[index] + def _levels_and_dtypes(self) -> typing.Dict[str, str]: + r"""Levels and dtypes of index columns. + + Returns: + dictionary with index levels (column names) + and associated audformat data type + + """ + return self.levels + class Table(Base): r"""Table conform to :ref:`table specifications `. @@ -1446,15 +1472,6 @@ def __init__( """ - levels = {} - levels[define.IndexField.FILE] = define.DataType.STRING - if self.type == define.IndexType.SEGMENTED: - levels[define.IndexField.START] = define.DataType.TIME - levels[define.IndexField.END] = define.DataType.TIME - - self.levels = levels - r"""Index levels.""" - super().__init__( index, split_id=split_id, @@ -1725,6 +1742,21 @@ def _get_by_index( return result + def _levels_and_dtypes(self) -> typing.Dict[str, str]: + r"""Levels and dtypes of index columns. + + Returns: + dictionary with index levels (column names) + and associated audformat data type + + """ + levels_and_dtypes = {} + levels_and_dtypes[define.IndexField.FILE] = define.DataType.STRING + if self.type == define.IndexType.SEGMENTED: + levels_and_dtypes[define.IndexField.START] = define.DataType.TIME + levels_and_dtypes[define.IndexField.END] = define.DataType.TIME + return levels_and_dtypes + def _assert_table_index( table: Base, From 2a359f1bce9ed3396374eebd0bfe0adb4a9b6b04 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:52:16 +0200 Subject: [PATCH 21/67] Merge stash --- audformat/core/table.py | 3 +-- audformat/core/utils.py | 59 ++++++++++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 4747bd6a..f95f5631 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1316,8 +1316,7 @@ def __init__( f"{levels}, " f"but names must be non-empty and unique." 
) - - dtypes = [to_audformat_dtype(dtype) for dtype in utils._dtypes(index)] + dtypes = utils._audformat_dtypes(index) self.levels = {level: dtype for level, dtype in zip(levels, dtypes)} super().__init__( diff --git a/audformat/core/utils.py b/audformat/core/utils.py index 19b33a74..fdb0b411 100644 --- a/audformat/core/utils.py +++ b/audformat/core/utils.py @@ -929,8 +929,7 @@ def is_index_alike( # check dtypes dtypes = set() for obj in objs: - ds = [to_audformat_dtype(dtype) for dtype in _dtypes(obj)] - dtypes.add(tuple(ds)) + dtypes.add(tuple(_audformat_dtypes(obj))) if len(dtypes) > 1: return False @@ -2017,7 +2016,7 @@ def _assert_index_alike( dtypes = [] for obj in objs: - ds = [to_audformat_dtype(dtype) for dtype in _dtypes(obj)] + ds = _audformat_dtypes(obj) dtypes.append(tuple(ds) if len(ds) > 1 else ds[0]) dtypes = list(dict.fromkeys(dtypes)) if len(dtypes) > 1: @@ -2026,12 +2025,18 @@ def _assert_index_alike( raise ValueError(msg) -def _dtypes(obj): - r"""List of dtypes of object.""" - if isinstance(obj, pd.MultiIndex): - return list(obj.dtypes) - else: - return [obj.dtype] +def _audformat_dtypes(index) -> typing.List[str]: + r"""List of audformat data types of index. + + Args: + index: index + + Returns: + audformat data types of index + + """ + dtypes = _pandas_dtypes(index) + return [to_audformat_dtype(dtype) for dtype in dtypes] def _is_same_dtype(d1, d2) -> bool: @@ -2051,12 +2056,20 @@ def _is_same_dtype(d1, d2) -> bool: return d1.name == d2.name -def _levels(obj): - r"""List of levels of object.""" - if isinstance(obj, pd.MultiIndex): - return list(obj.names) +def _levels(index) -> typing.List[str]: + r"""List of levels of index. + + Args: + index: index + + Returns: + index levels + + """ + if isinstance(index, pd.MultiIndex): + return list(index.names) else: - return [obj.name] + return [index.name] def _maybe_convert_filewise_index( @@ -2101,7 +2114,7 @@ def _maybe_convert_pandas_dtype( """ levels = _levels(index) - dtypes = _dtypes(index) + dtypes = _pandas_dtypes(index) # Ensure integers are stored as Int64 int_dtypes = { @@ -2152,3 +2165,19 @@ def _maybe_convert_single_level_multi_index( objs[idx].index = obj.index.get_level_values(0) return objs + + +def _pandas_dtypes(index) -> typing.List[typing.Any]: + r"""List of pandas dtypes of index. 
+ + Args: + index: index + + Returns: + pandas data types of index + + """ + if isinstance(index, pd.MultiIndex): + return list(index.dtypes) + else: + return [index.dtype] From 01678d9ead14b738efe34a6407443cecb304ef0f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:52:56 +0200 Subject: [PATCH 22/67] Remove levels from doctest output --- audformat/core/database.py | 1 - audformat/core/table.py | 1 - 2 files changed, 2 deletions(-) diff --git a/audformat/core/database.py b/audformat/core/database.py index 8c26bedf..5eb72e68 100644 --- a/audformat/core/database.py +++ b/audformat/core/database.py @@ -121,7 +121,6 @@ class Database(HeaderBase): tables: table: type: filewise - levels: {file: str} media_id: audio columns: column: {scheme_id: emotion, rater_id: rater} diff --git a/audformat/core/table.py b/audformat/core/table.py index f95f5631..1a14538d 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1379,7 +1379,6 @@ class Table(Base): >>> table["values"] = Column() >>> table type: filewise - levels: {file: str} split_id: test columns: values: {} From 92306d8f72a9fc4faa5a91f661d9ec160510fd73 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:57:58 +0200 Subject: [PATCH 23/67] Convert method to property --- audformat/core/table.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 1a14538d..a353505f 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -826,6 +826,7 @@ def _get_by_index( # Returns `df, df_is_copy` raise NotImplementedError() + @property def _levels_and_dtypes(self) -> typing.Dict[str, str]: r"""Levels and dtypes of index columns. @@ -837,6 +838,9 @@ def _levels_and_dtypes(self) -> typing.Dict[str, str]: # The returned dictionary is used # to infer index column names and dtypes # when reading CSV files. + # This means the names and dtypes cannot be inferred + # from the index itself, + # but need to be known before. raise NotImplementedError() # pragma: no cover def _load_csv(self, path: str): @@ -853,7 +857,7 @@ def _load_csv(self, path: str): path: path to table, including file extension """ - levels = list(self._levels_and_dtypes().keys()) + levels = list(self._levels_and_dtypes.keys()) columns = list(self.columns.keys()) table = csv.read_csv( path, @@ -1330,6 +1334,7 @@ def __init__( def _get_by_index(self, index: pd.Index) -> pd.DataFrame: return self.df.loc[index] + @property def _levels_and_dtypes(self) -> typing.Dict[str, str]: r"""Levels and dtypes of index columns. @@ -1740,6 +1745,7 @@ def _get_by_index( return result + @property def _levels_and_dtypes(self) -> typing.Dict[str, str]: r"""Levels and dtypes of index columns. 
From 2b727b9ebd5061721fddf77b14607d298957070b Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 09:58:38 +0200 Subject: [PATCH 24/67] Add comment --- audformat/core/table.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index a353505f..ac0be0e8 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -954,7 +954,8 @@ def _pyarrow_convert_dtypes( labeled_columns = [] # Collect columns, - # belonging to the index + # belonging to the table index + # (not the index of the provided dataframe) index_columns = [] # --- Index --- From ec50279229aeacff6562ef6405446660b5f6fb6f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:33:15 +0200 Subject: [PATCH 25/67] Simplify code --- audformat/core/table.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index ac0be0e8..815f7f4a 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1099,12 +1099,8 @@ def _save_csv(self, path: str): df.to_csv(fp, encoding="utf-8") def _save_parquet(self, path: str): - # Load table before opening PARQUET file - # to avoid creating a PARQUET file - # that is newer than the PKL file - df = self.df # loads table table = pa.Table.from_pandas( - df.reset_index(), + self.df.reset_index(), preserve_index=False, # TODO: check if faster when providing schema? # schema=self._schema, From f6820ea3fbea869b7716fcacb7331b14fdcca63a Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 11 Jun 2024 15:47:31 +0200 Subject: [PATCH 26/67] Simplify code --- audformat/core/table.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 815f7f4a..5eff43c1 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1099,12 +1099,7 @@ def _save_csv(self, path: str): df.to_csv(fp, encoding="utf-8") def _save_parquet(self, path: str): - table = pa.Table.from_pandas( - self.df.reset_index(), - preserve_index=False, - # TODO: check if faster when providing schema? - # schema=self._schema, - ) + table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) parquet.write_table(table, path) def _save_pickled(self, path: str): From fe50e53e441f566ceb62b46179e2bae1e120fd7f Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 12:18:38 +0200 Subject: [PATCH 27/67] Add test for md5sum of parquet file --- tests/test_table.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_table.py b/tests/test_table.py index d4ece3d9..1dc5addc 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1208,6 +1208,35 @@ def test_map(table, map): pd.testing.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "table_id, expected_md5sum", + [ + ("files", "a856aef8ec9d5e4b1752a13ad68cc0c2"), + ], +) +def test_parquet_reproducibility(tmpdir, table_id, expected_md5sum): + r"""Test reproducibility of binary PARQUET files. + + When storing the same dataframe + to different PARQUET files, + those files should have an identical + MD5sum, + which should also be reproducible + across different pandas and pyarrow versions. 
+ + """ + db = audformat.testing.create_db() + path_wo_ext = audeer.path(tmpdir, table_id) + path = f"{path_wo_ext}.parquet" + db[table_id].save(path_wo_ext, storage_format="parquet") + assert audeer.md5(path) == expected_md5sum + # Repeat writing after loading table + db[table_id].load(path_wo_ext) + os.remove(path) + db[table_id].save(path_wo_ext, storage_format="parquet") + assert audeer.md5(path) == expected_md5sum + + @pytest.mark.parametrize( "files", [ From f9d564e1d15865eec6ac8c8d7dccdd19725ded02 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 12:48:05 +0200 Subject: [PATCH 28/67] Switch back to snappy compression --- audformat/core/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 5eff43c1..0c4b1b73 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1100,7 +1100,7 @@ def _save_csv(self, path: str): def _save_parquet(self, path: str): table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) - parquet.write_table(table, path) + parquet.write_table(table, path, compression="snappy") def _save_pickled(self, path: str): self.df.to_pickle( From c53d8cc99e495cb6b48072115a1598f818a1fd62 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 12:48:52 +0200 Subject: [PATCH 29/67] Fix linter --- audformat/core/table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 0c4b1b73..27aaa2be 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -17,7 +17,6 @@ from audformat.core.column import Column from audformat.core.common import HeaderBase from audformat.core.common import HeaderDict -from audformat.core.common import to_audformat_dtype from audformat.core.errors import BadIdError from audformat.core.index import filewise_index from audformat.core.index import index_type From 0636a302cd73acc99b5e063caddccb16c2347391 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 14:23:40 +0200 Subject: [PATCH 30/67] Store hash inside parquet file --- audformat/core/table.py | 32 ++++++++++++++++++++++++++++++++ tests/test_table.py | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 27aaa2be..be9b030f 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1099,6 +1099,38 @@ def _save_csv(self, path: str): def _save_parquet(self, path: str): table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) + + # Add hash of dataframe + # to the metadata, + # which pyarrow stores inside the schema. + # See https://stackoverflow.com/a/58978449 + try: + metadata = {"hash": utils.hash(self.df)} + except TypeError: + # Levels/columns with dtype "object" might not be hashable, + # e.g. when storing numpy arrays. + # We convert them to strings in this case. 
+ # + # Index + df = self.df.copy() + update_index_dtypes = { + level: "string" + for level, dtype in self._levels_and_dtypes.items() + if dtype == define.DataType.OBJECT + } + df.index = utils.set_index_dtypes(df.index, update_index_dtypes) + # Columns + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = self.db.schemes[column.scheme_id] + if scheme.dtype == define.DataType.OBJECT: + df[column_id] = df[column_id].astype("string") + else: + # No scheme defaults to `object` dtype + df[column_id] = df[column_id].astype("string") + metadata = {"hash": utils.hash(df)} + + table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) parquet.write_table(table, path, compression="snappy") def _save_pickled(self, path: str): diff --git a/tests/test_table.py b/tests/test_table.py index 1dc5addc..2800cf0d 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1,9 +1,11 @@ import os +import random import re import typing import numpy as np import pandas as pd +import pyarrow.parquet as parquet import pytest import audeer @@ -1209,32 +1211,48 @@ def test_map(table, map): @pytest.mark.parametrize( - "table_id, expected_md5sum", + "table_id, expected_hash", [ - ("files", "a856aef8ec9d5e4b1752a13ad68cc0c2"), + ("files", "-4778271914368537359"), + ("segments", "6154135801036965154"), + ("misc", "8941499293930597709"), ], ) -def test_parquet_reproducibility(tmpdir, table_id, expected_md5sum): +def test_parquet_reproducibility(tmpdir, table_id, expected_hash): r"""Test reproducibility of binary PARQUET files. When storing the same dataframe to different PARQUET files, - those files should have an identical - MD5sum, - which should also be reproducible - across different pandas and pyarrow versions. + the files will slightly vary + and have different MD5 sums. + + To provide a reproducible hash, + in order to judge if a table has changed, + we calculate the hash of the table + and store it in the metadata + of the schema + of a the table. 
""" + random.seed(1) # ensure the same random table values are created db = audformat.testing.create_db() + + # Check that the output of audfromat.utils.hash() does not change + assert audformat.utils.hash(db[table_id].df) == expected_hash + + # Write to PARQUET file and check if correct hash is stored path_wo_ext = audeer.path(tmpdir, table_id) path = f"{path_wo_ext}.parquet" db[table_id].save(path_wo_ext, storage_format="parquet") - assert audeer.md5(path) == expected_md5sum - # Repeat writing after loading table + metadata = parquet.read_schema(path).metadata + assert metadata[b"hash"].decode() == expected_hash + + # Load table from PARQUET file, and overwrite it db[table_id].load(path_wo_ext) os.remove(path) db[table_id].save(path_wo_ext, storage_format="parquet") - assert audeer.md5(path) == expected_md5sum + metadata = parquet.read_schema(path).metadata + assert metadata[b"hash"].decode() == expected_hash @pytest.mark.parametrize( From 77eb826df1dd6c9f41cc71af25a9c5e94d908029 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 14:34:19 +0200 Subject: [PATCH 31/67] Fix code coverage --- tests/test_table.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_table.py b/tests/test_table.py index 2800cf0d..215ae8a0 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1506,6 +1506,8 @@ def test_save_and_load(tmpdir, storage_format): db["multi-misc"]["arrays"].set([np.array([0, 1]), np.array([2, 3])]) db["multi-misc"]["lists"] = audformat.Column(scheme_id="object") db["multi-misc"]["lists"].set([[0, 1], [2, 3]]) + db["multi-misc"]["no-scheme"] = audformat.Column() + db["multi-misc"]["no-scheme"].set([0, 1]) for table_id in list(db): expected_df = db[table_id].get() From 4a54cb008a6951c3267e65b733284901b170987c Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 12 Jun 2024 15:41:51 +0200 Subject: [PATCH 32/67] Stay with CSV as default table format --- audformat/core/database.py | 2 +- audformat/core/table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audformat/core/database.py b/audformat/core/database.py index 5eb72e68..0a84447a 100644 --- a/audformat/core/database.py +++ b/audformat/core/database.py @@ -979,7 +979,7 @@ def save( r"""Save database to disk. Creates a header ``/.yaml`` - and for every table a file ``/..[csv,pkl]``. + and for every table a file ``/..[csv,parquet,pkl]``. Existing files will be overwritten. If ``update_other_formats`` is provided, diff --git a/audformat/core/table.py b/audformat/core/table.py index be9b030f..cf3f0fcd 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -578,7 +578,7 @@ def save( self, path: str, *, - storage_format: str = define.TableStorageFormat.PARQUET, + storage_format: str = define.TableStorageFormat.CSV, update_other_formats: bool = True, ): r"""Save table data to disk. 
From 13a7769474c03e03c9bd437653134ad970f424fa Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 10:42:15 +0200 Subject: [PATCH 33/67] Test pyarrow==15.0.2 --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2060e9bd..fe51ddb3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,6 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt + pip install "pyarrow==15.0.2" - name: Test with pytest run: | From 6b07a24e661d13f41b3d6fdbf1dce7b6fec65376 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 10:50:26 +0200 Subject: [PATCH 34/67] Test pyarrow==14.0.2 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fe51ddb3..5455be87 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==15.0.2" + pip install "pyarrow==14.0.2" - name: Test with pytest run: | From 563a892ecbdae820809094b6e484f452a3d43598 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 10:54:18 +0200 Subject: [PATCH 35/67] Test pyarrow==13.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5455be87..1a2ca341 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==14.0.2" + pip install "pyarrow==13.0" - name: Test with pytest run: | From 4b451ef64619b33b2dbfc2623f3d367920f1393e Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 10:57:31 +0200 Subject: [PATCH 36/67] Test pyarrow==12.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1a2ca341..b18e4863 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==13.0" + pip install "pyarrow==12.0" - name: Test with pytest run: | From 63188ae16e0f34a3c704cac160f5f05951cf1037 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:00:45 +0200 Subject: [PATCH 37/67] Test pyarrow==11.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b18e4863..49c7d3f1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==12.0" + pip install "pyarrow==11.0" - name: Test with pytest run: | From e2eee7fcade1115463a2c7050f7a8c31596ffea2 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:03:29 +0200 Subject: [PATCH 38/67] Test pyarrow==10.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 49c7d3f1..95f038a5 100644 
--- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==11.0" + pip install "pyarrow==10.0" - name: Test with pytest run: | From bf8dd5998a6f0a77ad63df20154e267085e7b1a4 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:06:11 +0200 Subject: [PATCH 39/67] Test pyarrow==10.0.1 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 95f038a5..20f6abc0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==10.0" + pip install "pyarrow==10.0.1" - name: Test with pytest run: | From 83cac4f0464d498b8d0bd1f453f94b294516d003 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:09:45 +0200 Subject: [PATCH 40/67] Require pyarrow>=10.0.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 14ad2128..b007b679 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ 'iso-639', 'iso3166', 'oyaml', - 'pyarrow', + 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', 'pandas >=2.1.0', ] From c78da845add0bd12ee26bbfd1770892d60b59634 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:29:37 +0200 Subject: [PATCH 41/67] Test pandas<2.1.0 --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 20f6abc0..1e12c40d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -47,7 +47,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pyarrow==10.0.1" + pip install "pandas<2.1.0" - name: Test with pytest run: | From 263f97026d556f75104b4ced538d60440ce5b4fd Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:33:46 +0200 Subject: [PATCH 42/67] Add explanations for requirements --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b007b679..327844de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.9' +requires-python = '>=3.9' # pandas >=2.1.0 dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', @@ -37,7 +37,7 @@ dependencies = [ 'oyaml', 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', - 'pandas >=2.1.0', + 'pandas >=2.1.0', # support in timedelta ] # Get version dynamically from git # (needs setuptools_scm tools config below) From d51d01db2f49e723e0135562d92dc8d03a203205 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:40:07 +0200 Subject: [PATCH 43/67] Add test using minimum pip requirements --- .github/workflows/test.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1e12c40d..dcfede44 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,11 +14,15 @@ jobs: matrix: os: [ ubuntu-20.04, windows-latest, macOS-latest ] 
python-version: [ '3.10' ] + requirements: [ 'newest' ] include: - os: ubuntu-latest python-version: '3.9' - os: ubuntu-latest python-version: '3.11' + - os: ubuntu-latest + python-version: '3.9' + requirements: 'minimum' steps: - uses: actions/checkout@v4 @@ -47,7 +51,15 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt pip install -r tests/requirements.txt - pip install "pandas<2.1.0" + + - name: Downgrade to minimum dependencies + run: | + pip install "audeer==2.0.0" + pip install "audiofile>=0.4.0" + pip install "pandas==2.1.0" + pip install "pyarrow==10.0.1" + pip install "pyyaml==5.4.1" + if: matrix.requirements == 'minimum' - name: Test with pytest run: | From f889b75ab9add2e4167b3123ae98d53b630d5d87 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:55:10 +0200 Subject: [PATCH 44/67] Fix alphabetical order of requirements --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 327844de..6e4bc361 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,9 +35,9 @@ dependencies = [ 'iso-639', 'iso3166', 'oyaml', + 'pandas >=2.1.0', # support in timedelta 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', - 'pandas >=2.1.0', # support in timedelta ] # Get version dynamically from git # (needs setuptools_scm tools config below) From 96df9ac54b0933e4d63c82c2e27d5e2ad537c358 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:55:25 +0200 Subject: [PATCH 45/67] Enhance test matrix definition --- .github/workflows/test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dcfede44..992a7b4b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,7 +14,6 @@ jobs: matrix: os: [ ubuntu-20.04, windows-latest, macOS-latest ] python-version: [ '3.10' ] - requirements: [ 'newest' ] include: - os: ubuntu-latest python-version: '3.9' From f37de7e51e63c208c5c999133948d61e71ff430d Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 11:55:50 +0200 Subject: [PATCH 46/67] Debug failing test --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 992a7b4b..c5e3bb18 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -53,11 +53,11 @@ jobs: - name: Downgrade to minimum dependencies run: | - pip install "audeer==2.0.0" - pip install "audiofile>=0.4.0" + # pip install "audeer==2.0.0" + # pip install "audiofile>=0.4.0" pip install "pandas==2.1.0" pip install "pyarrow==10.0.1" - pip install "pyyaml==5.4.1" + # pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' - name: Test with pytest From 17ea1d9d524f9118aee57346bc5b41daac689ebd Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 16:28:08 +0200 Subject: [PATCH 47/67] Test different hash method --- audformat/core/table.py | 64 +++++++++++++++++++++++++---------------- tests/test_table.py | 17 ++++++++--- 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index cf3f0fcd..5489ba14 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1,6 +1,8 @@ from __future__ import annotations # allow typing without string import copy +import hashlib +import io import os import pickle import typing @@ -1100,35 +1102,47 @@ def _save_csv(self, path: str): def _save_parquet(self, path: 
str): table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) + # audformat.utils.hash() cannot be used due to: + # * https://github.com/audeering/audformat/issues/434 + # * https://github.com/audeering/audformat/issues/433 + # # Add hash of dataframe + # # to the metadata, + # # which pyarrow stores inside the schema. + # # See https://stackoverflow.com/a/58978449 + # try: + # metadata = {"hash": utils.hash(self.df)} + # except TypeError: + # # Levels/columns with dtype "object" might not be hashable, + # # e.g. when storing numpy arrays. + # # We convert them to strings in this case. + # # + # # Index + # df = self.df.copy() + # update_index_dtypes = { + # level: "string" + # for level, dtype in self._levels_and_dtypes.items() + # if dtype == define.DataType.OBJECT + # } + # df.index = utils.set_index_dtypes(df.index, update_index_dtypes) + # # Columns + # for column_id, column in self.columns.items(): + # if column.scheme_id is not None: + # scheme = self.db.schemes[column.scheme_id] + # if scheme.dtype == define.DataType.OBJECT: + # df[column_id] = df[column_id].astype("string") + # else: + # # No scheme defaults to `object` dtype + # df[column_id] = df[column_id].astype("string") + # metadata = {"hash": utils.hash(df)} + # Add hash of dataframe # to the metadata, # which pyarrow stores inside the schema. # See https://stackoverflow.com/a/58978449 - try: - metadata = {"hash": utils.hash(self.df)} - except TypeError: - # Levels/columns with dtype "object" might not be hashable, - # e.g. when storing numpy arrays. - # We convert them to strings in this case. - # - # Index - df = self.df.copy() - update_index_dtypes = { - level: "string" - for level, dtype in self._levels_and_dtypes.items() - if dtype == define.DataType.OBJECT - } - df.index = utils.set_index_dtypes(df.index, update_index_dtypes) - # Columns - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - scheme = self.db.schemes[column.scheme_id] - if scheme.dtype == define.DataType.OBJECT: - df[column_id] = df[column_id].astype("string") - else: - # No scheme defaults to `object` dtype - df[column_id] = df[column_id].astype("string") - metadata = {"hash": utils.hash(df)} + buffer = io.BytesIO() + self.df.to_parquet(buffer) + hash_df = hashlib.sha256(buffer.getbuffer()).hexdigest() + metadata = {"hash": hash_df} table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) parquet.write_table(table, path, compression="snappy") diff --git a/tests/test_table.py b/tests/test_table.py index 215ae8a0..fb8a03ac 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1213,9 +1213,18 @@ def test_map(table, map): @pytest.mark.parametrize( "table_id, expected_hash", [ - ("files", "-4778271914368537359"), - ("segments", "6154135801036965154"), - ("misc", "8941499293930597709"), + ( + "files", + "b079f9c2331d924a0388dde079cde55c7dcf6bf2bae851d77dc5cba5b33c31e1", + ), + ( + "segments", + "741e139f7adae5199539ec8260f3a55a868038865a3f5a385ea172a5ca72960b", + ), + ( + "misc", + "cb09eb7d3adaf7d45dfff0606c6ab61a1a03333aa1b8351febbba20d8c22a63d", + ), ], ) def test_parquet_reproducibility(tmpdir, table_id, expected_hash): @@ -1238,7 +1247,7 @@ def test_parquet_reproducibility(tmpdir, table_id, expected_hash): db = audformat.testing.create_db() # Check that the output of audfromat.utils.hash() does not change - assert audformat.utils.hash(db[table_id].df) == expected_hash + # assert audformat.utils.hash(db[table_id].df) == expected_hash # Write to PARQUET file and check if 
correct hash is stored path_wo_ext = audeer.path(tmpdir, table_id) From 495e09514de40721e21ee432c1ccf5011527c651 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 13 Jun 2024 17:06:41 +0200 Subject: [PATCH 48/67] Use different hashing approach --- audformat/core/table.py | 48 +++++++++++++++++++++++++++++++++++------ tests/test_table.py | 6 +++--- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 5489ba14..f71be8c8 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -2,7 +2,6 @@ import copy import hashlib -import io import os import pickle import typing @@ -1100,7 +1099,9 @@ def _save_csv(self, path: str): df.to_csv(fp, encoding="utf-8") def _save_parquet(self, path: str): - table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) + df = self.df.reset_index() + + table = pa.Table.from_pandas(df, preserve_index=False) # audformat.utils.hash() cannot be used due to: # * https://github.com/audeering/audformat/issues/434 @@ -1138,11 +1139,44 @@ def _save_parquet(self, path: str): # Add hash of dataframe # to the metadata, # which pyarrow stores inside the schema. - # See https://stackoverflow.com/a/58978449 - buffer = io.BytesIO() - self.df.to_parquet(buffer) - hash_df = hashlib.sha256(buffer.getbuffer()).hexdigest() - metadata = {"hash": hash_df} + # See https://stackoverflow.com/a/58978449. + # + # The hashing method was suggested at + # https://github.com/pandas-dev/pandas/issues/46705#issuecomment-1094123442 + # as pandas.util.hash_pandas_object() + # ignores column and index names + # buffer = io.BytesIO() + # self.df.to_parquet(buffer) + # hash_df = hashlib.sha256(buffer.getbuffer()).hexdigest() + # metadata = {"hash": hash_df} + schema_str = table.schema.to_string( + show_field_metadata=False, + show_schema_metadata=False, + ) + try: + hash_data = utils.hash(df) + except TypeError: + # Levels/columns with dtype "object" might not be hashable, + # e.g. when storing numpy arrays. + # We convert them to strings in this case. 
+ # + # Index + for level, dtype in self._levels_and_dtypes.items(): + if dtype == define.DataType.OBJECT: + df[level] = df[level].astype("string") + # Columns + for column_id, column in self.columns.items(): + if column.scheme_id is not None: + scheme = self.db.schemes[column.scheme_id] + if scheme.dtype == define.DataType.OBJECT: + df[column_id] = df[column_id].astype("string") + else: + # No scheme defaults to `object` dtype + df[column_id] = df[column_id].astype("string") + hash_data = utils.hash(df) + + hash_table = hashlib.sha256((hash_data + schema_str).encode()).hexdigest() + metadata = {"hash": hash_table} table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) parquet.write_table(table, path, compression="snappy") diff --git a/tests/test_table.py b/tests/test_table.py index fb8a03ac..45c5cf71 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1215,15 +1215,15 @@ def test_map(table, map): [ ( "files", - "b079f9c2331d924a0388dde079cde55c7dcf6bf2bae851d77dc5cba5b33c31e1", + "a6031ff402141834ec9ca3886e8672261a2671b534aaae798cf5918f12b9db14", ), ( "segments", - "741e139f7adae5199539ec8260f3a55a868038865a3f5a385ea172a5ca72960b", + "8bb0c5da4aaf1c4b145361a1542ebd2f3857fabc6fdc3cf80deba1307109f5dc", ), ( "misc", - "cb09eb7d3adaf7d45dfff0606c6ab61a1a03333aa1b8351febbba20d8c22a63d", + "ecc24f9ab8c25995017396f363987990d7421507532ee78da57cab0ca2e4b680", ), ], ) From f374fe0aa0518b1e89b63a2824176e4d8ac6fffb Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 14 Jun 2024 11:46:20 +0200 Subject: [PATCH 49/67] Require pandas>=2.2.0 and fix hashes --- .github/workflows/test.yml | 2 +- audformat/core/table.py | 76 ++++++++++++++------------------------ pyproject.toml | 4 +- tests/test_table.py | 6 +-- 4 files changed, 33 insertions(+), 55 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c5e3bb18..e728034b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,7 +55,7 @@ jobs: run: | # pip install "audeer==2.0.0" # pip install "audiofile>=0.4.0" - pip install "pandas==2.1.0" + pip install "pandas==2.2.0" pip install "pyarrow==10.0.1" # pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' diff --git a/audformat/core/table.py b/audformat/core/table.py index f71be8c8..a78529cc 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1103,67 +1103,44 @@ def _save_parquet(self, path: str): table = pa.Table.from_pandas(df, preserve_index=False) - # audformat.utils.hash() cannot be used due to: - # * https://github.com/audeering/audformat/issues/434 - # * https://github.com/audeering/audformat/issues/433 - # # Add hash of dataframe - # # to the metadata, - # # which pyarrow stores inside the schema. - # # See https://stackoverflow.com/a/58978449 - # try: - # metadata = {"hash": utils.hash(self.df)} - # except TypeError: - # # Levels/columns with dtype "object" might not be hashable, - # # e.g. when storing numpy arrays. - # # We convert them to strings in this case. 
- # # - # # Index - # df = self.df.copy() - # update_index_dtypes = { - # level: "string" - # for level, dtype in self._levels_and_dtypes.items() - # if dtype == define.DataType.OBJECT - # } - # df.index = utils.set_index_dtypes(df.index, update_index_dtypes) - # # Columns - # for column_id, column in self.columns.items(): - # if column.scheme_id is not None: - # scheme = self.db.schemes[column.scheme_id] - # if scheme.dtype == define.DataType.OBJECT: - # df[column_id] = df[column_id].astype("string") - # else: - # # No scheme defaults to `object` dtype - # df[column_id] = df[column_id].astype("string") - # metadata = {"hash": utils.hash(df)} - # Add hash of dataframe # to the metadata, # which pyarrow stores inside the schema. # See https://stackoverflow.com/a/58978449. # - # The hashing method was suggested at - # https://github.com/pandas-dev/pandas/issues/46705#issuecomment-1094123442 - # as pandas.util.hash_pandas_object() - # ignores column and index names - # buffer = io.BytesIO() - # self.df.to_parquet(buffer) - # hash_df = hashlib.sha256(buffer.getbuffer()).hexdigest() - # metadata = {"hash": hash_df} + # This allows us to track if a PARQUET file changes over time. + # We cannot rely on md5 sums of the file, + # as the file is written in a non-deterministic way. + table_hash = hashlib.md5() + + # Hash of schema (columns + dtypes) schema_str = table.schema.to_string( + # schema.metadata contains pandas related information, + # and the used pyarrow and pandas version, + # and needs to be excluded show_field_metadata=False, show_schema_metadata=False, ) + schema_hash = hashlib.md5(schema_str.encode()) + table_hash.update(schema_hash.digest()) + + # Hash data try: - hash_data = utils.hash(df) + data_hash = utils.hash(self.df) except TypeError: # Levels/columns with dtype "object" might not be hashable, # e.g. when storing numpy arrays. # We convert them to strings in this case. 
- # + # Index - for level, dtype in self._levels_and_dtypes.items(): - if dtype == define.DataType.OBJECT: - df[level] = df[level].astype("string") + df = self.df.copy() + update_index_dtypes = { + level: "string" + for level, dtype in self._levels_and_dtypes.items() + if dtype == define.DataType.OBJECT + } + df.index = utils.set_index_dtypes(df.index, update_index_dtypes) + # Columns for column_id, column in self.columns.items(): if column.scheme_id is not None: @@ -1173,12 +1150,13 @@ def _save_parquet(self, path: str): else: # No scheme defaults to `object` dtype df[column_id] = df[column_id].astype("string") - hash_data = utils.hash(df) + data_hash = utils.hash(df) - hash_table = hashlib.sha256((hash_data + schema_str).encode()).hexdigest() - metadata = {"hash": hash_table} + table_hash.update(data_hash.encode()) + metadata = {"hash": table_hash.hexdigest()} table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) + parquet.write_table(table, path, compression="snappy") def _save_pickled(self, path: str): diff --git a/pyproject.toml b/pyproject.toml index 6e4bc361..b0f45140 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,14 +28,14 @@ classifiers = [ 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.9' # pandas >=2.1.0 +requires-python = '>=3.9' # pandas >=2.2.0 dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', 'iso-639', 'iso3166', 'oyaml', - 'pandas >=2.1.0', # support in timedelta + 'pandas >=2.2.0', # hash values, see https://github.com/pandas-dev/pandas/issues/58999 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', ] diff --git a/tests/test_table.py b/tests/test_table.py index 45c5cf71..57539e00 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1215,15 +1215,15 @@ def test_map(table, map): [ ( "files", - "a6031ff402141834ec9ca3886e8672261a2671b534aaae798cf5918f12b9db14", + "4d0295654694751bdcd12be86b89b73e", ), ( "segments", - "8bb0c5da4aaf1c4b145361a1542ebd2f3857fabc6fdc3cf80deba1307109f5dc", + "d2a9b84d03abde24ae84cf647a019b71", ), ( "misc", - "ecc24f9ab8c25995017396f363987990d7421507532ee78da57cab0ca2e4b680", + "6b6faecc836354bd89472095c1fa746a", ), ], ) From 18e3ada5d8be7f60964a808c2caaadf3c5c5b400 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 14 Jun 2024 11:49:22 +0200 Subject: [PATCH 50/67] CI: re-enable all minimal requriements --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e728034b..8411bb10 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -53,11 +53,11 @@ jobs: - name: Downgrade to minimum dependencies run: | - # pip install "audeer==2.0.0" - # pip install "audiofile>=0.4.0" + pip install "audeer==2.0.0" + pip install "audiofile>=0.4.0" pip install "pandas==2.2.0" pip install "pyarrow==10.0.1" - # pip install "pyyaml==5.4.1" + pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' - name: Test with pytest From bc0c68fb501020259cee71f2a99a2388684ff494 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 14 Jun 2024 14:37:48 +0200 Subject: [PATCH 51/67] Hashing algorithm to respect row order --- audformat/core/table.py | 133 +++++++++++++++++++++++++--------------- tests/test_table.py | 104 +++++++++++++++++++++++++++++-- 2 files changed, 184 insertions(+), 53 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index a78529cc..d0621c16 100644 --- 
a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1099,61 +1099,36 @@ def _save_csv(self, path: str): df.to_csv(fp, encoding="utf-8") def _save_parquet(self, path: str): - df = self.df.reset_index() + r"""Save table as PARQUET file. - table = pa.Table.from_pandas(df, preserve_index=False) + A PARQUET file is written in a non-deterministic way, + and we cannot track changes by its MD5 sum. + To make changes trackable, + we store a hash in its metadata. - # Add hash of dataframe - # to the metadata, - # which pyarrow stores inside the schema. - # See https://stackoverflow.com/a/58978449. - # - # This allows us to track if a PARQUET file changes over time. - # We cannot rely on md5 sums of the file, - # as the file is written in a non-deterministic way. - table_hash = hashlib.md5() + The hash is calculated from the pyarrow schema + (to track column names and data types) + and the pandas dataframes + (to track values and order or rows), + from which the PARQUET file is generated. - # Hash of schema (columns + dtypes) - schema_str = table.schema.to_string( - # schema.metadata contains pandas related information, - # and the used pyarrow and pandas version, - # and needs to be excluded - show_field_metadata=False, - show_schema_metadata=False, - ) - schema_hash = hashlib.md5(schema_str.encode()) - table_hash.update(schema_hash.digest()) + The hash of the PARQUET can then be read by:: - # Hash data - try: - data_hash = utils.hash(self.df) - except TypeError: - # Levels/columns with dtype "object" might not be hashable, - # e.g. when storing numpy arrays. - # We convert them to strings in this case. - - # Index - df = self.df.copy() - update_index_dtypes = { - level: "string" - for level, dtype in self._levels_and_dtypes.items() - if dtype == define.DataType.OBJECT - } - df.index = utils.set_index_dtypes(df.index, update_index_dtypes) - - # Columns - for column_id, column in self.columns.items(): - if column.scheme_id is not None: - scheme = self.db.schemes[column.scheme_id] - if scheme.dtype == define.DataType.OBJECT: - df[column_id] = df[column_id].astype("string") - else: - # No scheme defaults to `object` dtype - df[column_id] = df[column_id].astype("string") - data_hash = utils.hash(df) + pyarrow.parquet.read_schema(path).metadata[b"hash"].decode() + + Args: + path: path, including file extension - table_hash.update(data_hash.encode()) + """ + table = pa.Table.from_pandas(self.df.reset_index(), preserve_index=False) + + # Create hash of table + table_hash = hashlib.md5() + table_hash.update(_schema_hash(table)) + table_hash.update(_dataframe_hash(self.df)) + # Store in metadata of file, + # see https://stackoverflow.com/a/58978449 metadata = {"hash": table_hash.hexdigest()} table = table.replace_schema_metadata({**metadata, **table.schema.metadata}) @@ -1855,6 +1830,46 @@ def _assert_table_index( ) +def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: + """Hash a dataframe. 
+ + The hash value takes into account: + + * index of dataframe + * values of the dataframe + * order of dataframe rows + + It does not consider: + + * column names of dataframe + * dtypes of dataframe + + Args: + df: dataframe + max_rows: if not ``None``, + the maximum number of rows, + taken into account for hashing + + Returns: + MD5 hash in bytes + + """ + # Idea for implementation from + # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410 + md5 = hashlib.md5() + if max_rows is not None and len(df) > max_rows: # pragma: nocover (not yet used) + df = df.sample(n=max_rows, random_state=0) + # Hash length, as we have to track if this changes + md5.update(str(len(df)).encode("utf-8")) + try: + md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8")) + except TypeError: + # Use pickle if pandas cannot hash the object, + # e.g. if it contains numpy.arrays. + md5.update(f"{pickle.dumps(df, pickle.HIGHEST_PROTOCOL)}".encode("utf-8")) + return md5.digest() + + def _maybe_convert_dtype_to_string( index: pd.Index, ) -> pd.Index: @@ -1877,3 +1892,23 @@ def _maybe_update_scheme( for scheme in table.db.schemes.values(): if table._id == scheme.labels: scheme.replace_labels(table._id) + + +def _schema_hash(table: pa.Table) -> bytes: + r"""Hash pyarrow table schema. + + Args: + table: pyarrow table + + Returns: + MD5 hash in bytes + + """ + schema_str = table.schema.to_string( + # schema.metadata contains pandas related information, + # and the used pyarrow and pandas version, + # and needs to be excluded + show_field_metadata=False, + show_schema_metadata=False, + ) + return hashlib.md5(schema_str.encode()).digest() diff --git a/tests/test_table.py b/tests/test_table.py index 57539e00..4b367660 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1210,24 +1210,120 @@ def test_map(table, map): pd.testing.assert_frame_equal(result, expected) +@pytest.mark.parametrize("storage_format", ["csv", "parquet"]) +def test_hash(tmpdir, storage_format): + r"""Test if PARQUET file hash changes with table. + + We store a MD5 sum associated with the dataframe, + that was used to create the file, + in the metadata of the PARQUET file. + Those MD5 sum is supposed to change, + if any of the table rows, (index) columns changes, + the data type of the entries changes, + or the name of a column changes. 
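For illustration only, this behaviour can be sketched outside the test suite (database name, scheme, and values are made up; the stored_hash() helper exists only in this sketch)::

    import audeer
    import audformat
    from pyarrow import parquet

    def stored_hash(path):
        return parquet.read_schema(path).metadata[b"hash"].decode()

    db = audformat.Database("mydb")
    db.schemes["int"] = audformat.Scheme("int")
    db["table"] = audformat.Table(audformat.filewise_index(["f1", "f2"]))
    db["table"]["column"] = audformat.Column(scheme_id="int")
    db["table"]["column"].set([0, 1])
    db_root = audeer.mkdir("./db")
    db.save(db_root, storage_format="parquet")
    hash_before = stored_hash(audeer.path(db_root, "db.table.parquet"))

    # Changing a single value is expected to change the stored hash,
    # whereas re-saving an identical table keeps it unchanged
    db["table"]["column"].set([1, 1])
    db.save(db_root, storage_format="parquet")
    assert stored_hash(audeer.path(db_root, "db.table.parquet")) != hash_before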
+ + Args: + tmpdir: tmpdir fixture + storage_format: storage format of table file + + """ + + def get_md5(path: str) -> str: + r"""Get MD5 sum for table file.""" + ext = audeer.file_extension(path) + if ext == "csv": + md5 = audeer.md5(path) + elif ext == "parquet": + md5 = parquet.read_schema(path).metadata[b"hash"].decode() + return md5 + + db_root = audeer.path(tmpdir, "db") + db = audformat.Database("mydb") + db.schemes["int"] = audformat.Scheme("int") + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + + table_file = audeer.path(db_root, f"db.table.{storage_format}") + assert os.path.exists(table_file) + md5 = get_md5(table_file) + + # Replace table with identical copy + table = db["table"].copy() + db["table"] = table + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) == md5 + + # Change order of rows + index = audformat.segmented_index(["f2", "f1"], [1, 0], [2, 1]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([1, 0]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + # Change index entry + index = audformat.segmented_index(["f1", "f1"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + # Change data entry + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([1, 0]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + # Change column name + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["col"] = audformat.Column(scheme_id="int") + db["table"]["col"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + # Change order of columns + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["col1"] = audformat.Column(scheme_id="int") + db["table"]["col1"].set([0, 1]) + db["table"]["col2"] = audformat.Column(scheme_id="int") + db["table"]["col2"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + md5 = get_md5(table_file) + db["table"] = audformat.Table(index) + db["table"]["col2"] = audformat.Column(scheme_id="int") + db["table"]["col2"].set([0, 1]) + db["table"]["col1"] = audformat.Column(scheme_id="int") + db["table"]["col1"].set([0, 1]) + db.save(db_root, storage_format=storage_format) + assert get_md5(table_file) != md5 + + @pytest.mark.parametrize( "table_id, expected_hash", [ ( "files", - "4d0295654694751bdcd12be86b89b73e", + "9caa6722e65a04ddbce1cda2238c9126", ), ( "segments", - "d2a9b84d03abde24ae84cf647a019b71", + "37c9d9dc4f937a6e97ec72a080055e49", ), ( "misc", - "6b6faecc836354bd89472095c1fa746a", + "3488c007d45b19e04e8fdbf000f0f04d", ), ], ) -def test_parquet_reproducibility(tmpdir, table_id, expected_hash): +def test_parquet_hash_reproducibility(tmpdir, table_id, expected_hash): r"""Test reproducibility of binary PARQUET files. 
When storing the same dataframe From 6c36e0aec9b45b5ab2eeccda58ffd87c87c26301 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Fri, 14 Jun 2024 14:49:21 +0200 Subject: [PATCH 52/67] Clean up tests --- tests/test_table.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_table.py b/tests/test_table.py index 4b367660..49ae61a6 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1342,9 +1342,6 @@ def test_parquet_hash_reproducibility(tmpdir, table_id, expected_hash): random.seed(1) # ensure the same random table values are created db = audformat.testing.create_db() - # Check that the output of audfromat.utils.hash() does not change - # assert audformat.utils.hash(db[table_id].df) == expected_hash - # Write to PARQUET file and check if correct hash is stored path_wo_ext = audeer.path(tmpdir, table_id) path = f"{path_wo_ext}.parquet" From 407aa912d93bfa89fe15819c31cf02183b29f310 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 11:27:48 +0200 Subject: [PATCH 53/67] Fix minimum install of audiofile --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8411bb10..210a685f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -54,7 +54,7 @@ jobs: - name: Downgrade to minimum dependencies run: | pip install "audeer==2.0.0" - pip install "audiofile>=0.4.0" + pip install "audiofile==0.4.0" pip install "pandas==2.2.0" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" From c9b576046774bdf47fca3a7c3e3d2dc4607df5ba Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 11:34:49 +0200 Subject: [PATCH 54/67] Fix docstring of Table.load() --- audformat/core/table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index d0621c16..bc66b943 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -444,10 +444,10 @@ def load( ): r"""Load table data from disk. - Tables can be stored as PKL and/or CSV files to disk. - If both files are present + Tables are stored as CSV, PARQUET and/or PKL files to disk. + If the PKL file exists, it will load the PKL file - as long as its modification date is newer, + as long as its modification date is the newest, otherwise it will raise an error and ask to delete one of the files. @@ -456,7 +456,7 @@ def load( Raises: RuntimeError: if table file(s) are missing - RuntimeError: if CSV file is newer than PKL file + RuntimeError: if CSV or PARQUET file is newer than PKL file """ path = audeer.path(path) From 589da4b988ae063146035fc892e72dd44a0cf0ea Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 11:36:25 +0200 Subject: [PATCH 55/67] Fix docstring of Database.load() --- audformat/core/database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/audformat/core/database.py b/audformat/core/database.py index 0a84447a..2772f0a4 100644 --- a/audformat/core/database.py +++ b/audformat/core/database.py @@ -1383,7 +1383,7 @@ def load( r"""Load database from disk. Expects a header ``/.yaml`` - and for every table a file ``/..[csv|pkl]`` + and for every table a file ``/..[csv|parquet|pkl]`` Media files should be located under ``root``. 
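A sketch of the error condition mentioned above (the directory name is illustrative, and the exact message may differ): if a CSV table file ends up newer than its PKL counterpart, loading the table data is expected to fail::

    import time

    import audeer
    import audformat
    import audformat.testing

    db = audformat.testing.create_db()
    db_root = audeer.mkdir("./db")
    db.save(db_root, storage_format="pkl")
    time.sleep(0.1)
    db.save(db_root, storage_format="csv", update_other_formats=False)

    # The CSV files are now newer than the PKL files,
    # so loading the table data should raise a RuntimeError
    audformat.Database.load(db_root, load_data=True)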
Args: @@ -1409,7 +1409,7 @@ def load( Raises: FileNotFoundError: if the database header file cannot be found under ``root`` - RuntimeError: if a CSV table file is newer + RuntimeError: if a CSV or PARQUET table file is newer than the corresponding PKL file """ From b0ee769975a683d0aafdbc035de2b29e72b70e02 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:17:20 +0200 Subject: [PATCH 56/67] Ensure correct order in time when storing tables --- audformat/core/table.py | 17 ++++++--- tests/test_table.py | 79 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 4 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index bc66b943..995e3bfc 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -599,25 +599,34 @@ def save( path = audeer.path(path) define.TableStorageFormat._assert_has_attribute_value(storage_format) + csv_file = f"{path}.{define.TableStorageFormat.CSV}" parquet_file = f"{path}.{define.TableStorageFormat.PARQUET}" pickle_file = f"{path}.{define.TableStorageFormat.PICKLE}" - csv_file = f"{path}.{define.TableStorageFormat.CSV}" - # Make sure the CSV|PARQUET file is always written first - # as it is expected to be older by load() + # Ensure the following storage order: + # 1. PARQUET file + # 2. CSV file + # 3. PKL file + # The PKl is expected to be the oldest by load(), + # the order of PARQUET and CSV file + # is only a convention for now. if storage_format == define.TableStorageFormat.PICKLE: if update_other_formats and os.path.exists(parquet_file): self._save_parquet(parquet_file) - elif update_other_formats and os.path.exists(csv_file): + if update_other_formats and os.path.exists(csv_file): self._save_csv(csv_file) self._save_pickled(pickle_file) if storage_format == define.TableStorageFormat.PARQUET: self._save_parquet(parquet_file) + if update_other_formats and os.path.exists(csv_file): + self._save_csv(csv_file) if update_other_formats and os.path.exists(pickle_file): self._save_pickled(pickle_file) if storage_format == define.TableStorageFormat.CSV: + if update_other_formats and os.path.exists(parquet_file): + self._save_parquet(parquet_file) self._save_csv(csv_file) if update_other_formats and os.path.exists(pickle_file): self._save_pickled(pickle_file) diff --git a/tests/test_table.py b/tests/test_table.py index 49ae61a6..c2fdcadb 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1,6 +1,7 @@ import os import random import re +import time import typing import numpy as np @@ -2122,3 +2123,81 @@ def test_update(table, overwrite, others): for column_id, column in other.columns.items(): assert column.scheme == table[column_id].scheme assert column.rater == table[column_id].rater + + +@pytest.mark.parametrize("update_other_formats", [True, False]) +@pytest.mark.parametrize( + "storage_format, existing_formats", + [ + ("csv", []), + ("csv", []), + ("csv", ["pkl"]), + ("csv", ["parquet", "pkl"]), + ("pkl", ["parquet"]), + ("pkl", ["csv"]), + ("pkl", ["parquet", "csv"]), + ("parquet", ["pkl"]), + ("parquet", ["csv"]), + ("parquet", ["pkl", "csv"]), + ], +) +def test_update_other_formats( + tmpdir, + storage_format, + existing_formats, + update_other_formats, +): + r"""Tests updating of other table formats. + + When a table is stored with `audformat.Table.save()` + as CSV, PARQUET, or PKL file, + a user might select + that all other existing file representations of the table + are updated as well. + E.g. 
if a PKL file of the same table exists, + and a user saves to a CSV file + with the argument `update_other_formats=True`, + it should write the table to the CSV and PKL file. + + """ + db = audformat.testing.create_db() + + table_id = "files" + table_file = audeer.path(tmpdir, "table") + + # Create existing table files and pause for a short time + old_mtime = {} + for ext in existing_formats: + db[table_id].save( + table_file, + storage_format=ext, + update_other_formats=False, + ) + old_mtime[ext] = os.path.getmtime(f"{table_file}.{ext}") + time.sleep(0.05) + + # Store table to requested format + db[table_id].save( + table_file, + storage_format=storage_format, + update_other_formats=update_other_formats, + ) + + # Collect mtimes of existing table files + mtime = {} + formats = existing_formats + [storage_format] + for ext in formats: + mtime[ext] = os.path.getmtime(f"{table_file}.{ext}") + + # Ensure mtimes are correct + if update_other_formats: + if "pickle" in formats and "csv" in formats: + assert mtime["pickle"] > mtime["csv"] + if "pickle" in formats and "parquet" in formats: + assert mtime["pickle"] > mtime["parquet"] + if "csv" in formats and "parquet" in formats: + assert mtime["csv"] > mtime["parquet"] + else: + for ext in existing_formats: + assert mtime[ext] == old_mtime[ext] + assert mtime[storage_format] > old_mtime[ext] From 1e167c13a39666b2f64508bc36808411fc87bf1a Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:19:46 +0200 Subject: [PATCH 57/67] Simplify comment --- audformat/core/table.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 995e3bfc..094779d3 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -847,9 +847,6 @@ def _levels_and_dtypes(self) -> typing.Dict[str, str]: # The returned dictionary is used # to infer index column names and dtypes # when reading CSV files. - # This means the names and dtypes cannot be inferred - # from the index itself, - # but need to be known before. raise NotImplementedError() # pragma: no cover def _load_csv(self, path: str): From 8ad8d742ed64a1218f4e878a4daf779694447b36 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:21:18 +0200 Subject: [PATCH 58/67] Add docstring to _load_pickle() --- audformat/core/table.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/audformat/core/table.py b/audformat/core/table.py index 094779d3..20cbbc29 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -896,6 +896,14 @@ def _load_parquet(self, path: str): self._df = df def _load_pickled(self, path: str): + r"""Load table from PKL file. + + The loaded table is stored under ``self._df``. + + Args: + path: path to table, including file extension + + """ # Older versions of audformat used xz compression # which produced smaller files, # but was slower. 
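The intended ordering can be illustrated with a small sketch (the file stem "./files" is arbitrary): when one format is saved with update_other_formats=True, the other existing representations are refreshed as well, and the PKL file ends up as the newest one::

    import os

    import audformat.testing

    db = audformat.testing.create_db()
    table = db["files"]

    table.save("./files", storage_format="parquet", update_other_formats=False)
    table.save("./files", storage_format="pkl", update_other_formats=False)
    table.save("./files", storage_format="csv", update_other_formats=True)

    # PARQUET is written first, then CSV, then PKL
    assert os.path.getmtime("./files.pkl") >= os.path.getmtime("./files.csv")
    assert os.path.getmtime("./files.csv") >= os.path.getmtime("./files.parquet")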
From 7b3a55801f9f24a0475dc0f7fd8c38e8c27306fd Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:23:58 +0200 Subject: [PATCH 59/67] Fix _save_parquet() docstring --- audformat/core/table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index 20cbbc29..c1a4c221 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1122,11 +1122,11 @@ def _save_parquet(self, path: str): The hash is calculated from the pyarrow schema (to track column names and data types) - and the pandas dataframes + and the pandas dataframe (to track values and order or rows), from which the PARQUET file is generated. - The hash of the PARQUET can then be read by:: + The hash of the PARQUET file can then be read by:: pyarrow.parquet.read_schema(path).metadata[b"hash"].decode() From d414fe7ee8c073f4e7b19ca0cc0586bceae8a512 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:25:44 +0200 Subject: [PATCH 60/67] Improve comment in _dataframe_hash() --- audformat/core/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index c1a4c221..d3732660 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1873,7 +1873,7 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: md5 = hashlib.md5() if max_rows is not None and len(df) > max_rows: # pragma: nocover (not yet used) df = df.sample(n=max_rows, random_state=0) - # Hash length, as we have to track if this changes + # Hash length of dataframe, as we have to track if this changes md5.update(str(len(df)).encode("utf-8")) try: md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8")) From a90eaf4d54236735281347d5ba436e5444f1c0cc Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:28:18 +0200 Subject: [PATCH 61/67] Document arguments of test_table_update... --- tests/test_table.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_table.py b/tests/test_table.py index c2fdcadb..e3975d1b 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -2159,6 +2159,14 @@ def test_update_other_formats( with the argument `update_other_formats=True`, it should write the table to the CSV and PKL file. 
+ Args: + tmpdir: tmpdir fixture + storage_format: storage format of table + existing_formats: formats the table should be stored in + before saving to ``storage_format`` + update_other_formats: if tables specified in ``existing_formats`` + should be updated when saving ``storage_format`` + """ db = audformat.testing.create_db() From 2749ef9e7794f0358a9946e54822e78c5a6663eb Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Tue, 18 Jun 2024 13:35:36 +0200 Subject: [PATCH 62/67] Relax test for table saving order --- tests/test_table.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_table.py b/tests/test_table.py index e3975d1b..5900d7f3 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -2200,11 +2200,11 @@ def test_update_other_formats( # Ensure mtimes are correct if update_other_formats: if "pickle" in formats and "csv" in formats: - assert mtime["pickle"] > mtime["csv"] + assert mtime["pickle"] >= mtime["csv"] if "pickle" in formats and "parquet" in formats: - assert mtime["pickle"] > mtime["parquet"] + assert mtime["pickle"] >= mtime["parquet"] if "csv" in formats and "parquet" in formats: - assert mtime["csv"] > mtime["parquet"] + assert mtime["csv"] >= mtime["parquet"] else: for ext in existing_formats: assert mtime[ext] == old_mtime[ext] From 3f21e3c41ae42cf8c37d01175bc82a5ea0b5fbea Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 08:52:13 +0200 Subject: [PATCH 63/67] Update audformat/core/table.py Co-authored-by: ChristianGeng --- audformat/core/table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/audformat/core/table.py b/audformat/core/table.py index d3732660..3565ab4a 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -935,7 +935,6 @@ def _pyarrow_convert_dtypes( convert_all: bool = False, ) -> pd.DataFrame: r"""Convert dtypes that are not handled by pyarrow. - This adjusts dtypes in a dataframe, that could not be set correctly when converting to the dataframe From 2912f76f38052b63ad1b127787416c66c2cd3781 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 08:54:05 +0200 Subject: [PATCH 64/67] Revert "Update audformat/core/table.py" This reverts commit 3f21e3c41ae42cf8c37d01175bc82a5ea0b5fbea. --- audformat/core/table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/audformat/core/table.py b/audformat/core/table.py index 3565ab4a..d3732660 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -935,6 +935,7 @@ def _pyarrow_convert_dtypes( convert_all: bool = False, ) -> pd.DataFrame: r"""Convert dtypes that are not handled by pyarrow. 
+ This adjusts dtypes in a dataframe, that could not be set correctly when converting to the dataframe From c4c41ff0aace45f14e4e076e2806cc6a5a0a2b50 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 16:23:37 +0200 Subject: [PATCH 65/67] Use numpy representation for hashing (#436) * Use numpy representation for hashing * Enable tests and require pandas>=1.4.1 * Use numpy<2.0 in minimum test * Skip doctests in minimum * Require pandas>=2.1.0 * Require numpy<=2.0.0 in minimum test * Remove print statements * Fix numpy<2.0.0 for minimum test * Remove max_rows argument * Simplify code --- .github/workflows/test.yml | 5 +++-- audformat/core/table.py | 26 ++++++++++---------------- pyproject.toml | 4 ++-- tests/test_table.py | 6 +++--- 4 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 210a685f..f895bbf6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,7 +4,7 @@ on: push: branches: [ main ] pull_request: - branches: [ main ] + branches: [ main, pyarrow ] jobs: build: @@ -55,7 +55,8 @@ jobs: run: | pip install "audeer==2.0.0" pip install "audiofile==0.4.0" - pip install "pandas==2.2.0" + pip install "numpy<2.0.0" + pip install "pandas==2.1.0" pip install "pyarrow==10.0.1" pip install "pyyaml==5.4.1" if: matrix.requirements == 'minimum' diff --git a/audformat/core/table.py b/audformat/core/table.py index d3732660..30924953 100644 --- a/audformat/core/table.py +++ b/audformat/core/table.py @@ -1844,7 +1844,7 @@ def _assert_table_index( ) -def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: +def _dataframe_hash(df: pd.DataFrame) -> bytes: """Hash a dataframe. The hash value takes into account: @@ -1860,27 +1860,21 @@ def _dataframe_hash(df: pd.DataFrame, max_rows: int = None) -> bytes: Args: df: dataframe - max_rows: if not ``None``, - the maximum number of rows, - taken into account for hashing Returns: MD5 hash in bytes """ - # Idea for implementation from - # https://github.com/streamlit/streamlit/issues/7086#issuecomment-1654504410 md5 = hashlib.md5() - if max_rows is not None and len(df) > max_rows: # pragma: nocover (not yet used) - df = df.sample(n=max_rows, random_state=0) - # Hash length of dataframe, as we have to track if this changes - md5.update(str(len(df)).encode("utf-8")) - try: - md5.update(bytes(str(pd.util.hash_pandas_object(df)), "utf-8")) - except TypeError: - # Use pickle if pandas cannot hash the object, - # e.g. if it contains numpy.arrays. 
- md5.update(f"{pickle.dumps(df, pickle.HIGHEST_PROTOCOL)}".encode("utf-8")) + for _, y in df.reset_index().items(): + # Convert every column to a numpy array, + # and hash its string representation + if y.dtype == "Int64": + # Enforce consistent conversion to numpy.array + # for integers across different pandas versions + # (since pandas 2.2.x, Int64 is converted to float if it contains ) + y = y.astype("float") + md5.update(bytes(str(y.to_numpy()), "utf-8")) return md5.digest() diff --git a/pyproject.toml b/pyproject.toml index b0f45140..13c329eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,14 +28,14 @@ classifiers = [ 'Programming Language :: Python :: 3.11', 'Topic :: Scientific/Engineering', ] -requires-python = '>=3.9' # pandas >=2.2.0 +requires-python = '>=3.9' # pandas >=2.1.0 dependencies = [ 'audeer >=2.0.0', 'audiofile >=0.4.0', 'iso-639', 'iso3166', 'oyaml', - 'pandas >=2.2.0', # hash values, see https://github.com/pandas-dev/pandas/issues/58999 + 'pandas >=2.1.0', # for pyarrow -> timedelta conversion 'pyarrow >=10.0.1', # for pyarrow strings in pandas 'pyyaml >=5.4.1', ] diff --git a/tests/test_table.py b/tests/test_table.py index 5900d7f3..1af6bd85 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1312,15 +1312,15 @@ def get_md5(path: str) -> str: [ ( "files", - "9caa6722e65a04ddbce1cda2238c9126", + "a66a22ee4158e0e5100f1d797155ad81", ), ( "segments", - "37c9d9dc4f937a6e97ec72a080055e49", + "f69eb4a5d19da71e5da00a9b13beb3db", ), ( "misc", - "3488c007d45b19e04e8fdbf000f0f04d", + "331f79758b195cb9b7d0e8889e830eb2", ), ], ) From 8e85168c73db7b9c5cdca69b95b592fc922dac22 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 17:12:36 +0200 Subject: [PATCH 66/67] Use test class --- tests/test_table.py | 168 ++++++++++++++++++++++++-------------------- 1 file changed, 92 insertions(+), 76 deletions(-) diff --git a/tests/test_table.py b/tests/test_table.py index 1af6bd85..2b5536de 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -1212,7 +1212,7 @@ def test_map(table, map): @pytest.mark.parametrize("storage_format", ["csv", "parquet"]) -def test_hash(tmpdir, storage_format): +class TestHash: r"""Test if PARQUET file hash changes with table. 
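A standalone sketch of this column-wise hashing (toy dataframe, not the audformat implementation itself) shows the detour over numpy and the Int64 workaround::

    import hashlib

    import pandas as pd

    df = pd.DataFrame({"column": pd.array([0, pd.NA], dtype="Int64")})

    md5 = hashlib.md5()
    for _, y in df.reset_index().items():
        if y.dtype == "Int64":
            # Force the same numpy conversion across pandas versions
            y = y.astype("float")
        md5.update(bytes(str(y.to_numpy()), "utf-8"))
    print(md5.hexdigest())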
We store a MD5 sum associated with the dataframe, @@ -1229,82 +1229,98 @@ def test_hash(tmpdir, storage_format): """ - def get_md5(path: str) -> str: + def db(self, tmpdir, storage_format): + r"""Create minimal database with scheme and table.""" + self.db_root = audeer.path(tmpdir, "db") + self.storage_format = storage_format + self.table_file = audeer.path(self.db_root, f"db.table.{storage_format}") + db = audformat.Database("mydb") + db.schemes["int"] = audformat.Scheme("int") + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + return db + + def md5(self) -> str: r"""Get MD5 sum for table file.""" - ext = audeer.file_extension(path) - if ext == "csv": - md5 = audeer.md5(path) - elif ext == "parquet": - md5 = parquet.read_schema(path).metadata[b"hash"].decode() - return md5 - - db_root = audeer.path(tmpdir, "db") - db = audformat.Database("mydb") - db.schemes["int"] = audformat.Scheme("int") - index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["column"] = audformat.Column(scheme_id="int") - db["table"]["column"].set([0, 1]) - db.save(db_root, storage_format=storage_format) - - table_file = audeer.path(db_root, f"db.table.{storage_format}") - assert os.path.exists(table_file) - md5 = get_md5(table_file) - - # Replace table with identical copy - table = db["table"].copy() - db["table"] = table - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) == md5 - - # Change order of rows - index = audformat.segmented_index(["f2", "f1"], [1, 0], [2, 1]) - db["table"] = audformat.Table(index) - db["table"]["column"] = audformat.Column(scheme_id="int") - db["table"]["column"].set([1, 0]) - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 - - # Change index entry - index = audformat.segmented_index(["f1", "f1"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["column"] = audformat.Column(scheme_id="int") - db["table"]["column"].set([0, 1]) - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 - - # Change data entry - index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["column"] = audformat.Column(scheme_id="int") - db["table"]["column"].set([1, 0]) - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 - - # Change column name - index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["col"] = audformat.Column(scheme_id="int") - db["table"]["col"].set([0, 1]) - db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 - - # Change order of columns - index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) - db["table"] = audformat.Table(index) - db["table"]["col1"] = audformat.Column(scheme_id="int") - db["table"]["col1"].set([0, 1]) - db["table"]["col2"] = audformat.Column(scheme_id="int") - db["table"]["col2"].set([0, 1]) - db.save(db_root, storage_format=storage_format) - md5 = get_md5(table_file) - db["table"] = audformat.Table(index) - db["table"]["col2"] = audformat.Column(scheme_id="int") - db["table"]["col2"].set([0, 1]) - db["table"]["col1"] = audformat.Column(scheme_id="int") - db["table"]["col1"].set([0, 1]) - 
db.save(db_root, storage_format=storage_format) - assert get_md5(table_file) != md5 + if self.storage_format == "csv": + return audeer.md5(self.table_file) + elif self.storage_format == "parquet": + return parquet.read_schema(self.table_file).metadata[b"hash"].decode() + + def test_change_index(self, tmpdir, storage_format): + r"""Change table index.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + index = audformat.segmented_index(["f1", "f1"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() != md5 + + def test_change_column_name(self, tmpdir, storage_format): + r"""Change table column name.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["col"] = audformat.Column(scheme_id="int") + db["table"]["col"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() != md5 + + def test_change_column_order(self, tmpdir, storage_format): + r"""Change order of table columns.""" + db = self.db(tmpdir, storage_format) + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["col1"] = audformat.Column(scheme_id="int") + db["table"]["col1"].set([0, 1]) + db["table"]["col2"] = audformat.Column(scheme_id="int") + db["table"]["col2"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + md5 = self.md5() + db["table"] = audformat.Table(index) + db["table"]["col2"] = audformat.Column(scheme_id="int") + db["table"]["col2"].set([0, 1]) + db["table"]["col1"] = audformat.Column(scheme_id="int") + db["table"]["col1"].set([0, 1]) + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() != md5 + + def test_change_row_order(self, tmpdir, storage_format): + r"""Change order of table rows.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + index = audformat.segmented_index(["f2", "f1"], [1, 0], [2, 1]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([1, 0]) + db.save(self.db_root, storage_format=storage_format) + assert self.md5() != md5 + + def test_change_values(self, tmpdir, storage_format): + r"""Change table values.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + index = audformat.segmented_index(["f1", "f2"], [0, 1], [1, 2]) + db["table"] = audformat.Table(index) + db["table"]["column"] = audformat.Column(scheme_id="int") + db["table"]["column"].set([1, 0]) + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() != md5 + + def test_copy_table(self, tmpdir, storage_format): + r"""Replace table with identical copy.""" + db = self.db(tmpdir, storage_format) + md5 = self.md5() + table = db["table"].copy() + db["table"] = table + db.save(self.db_root, storage_format=self.storage_format) + assert self.md5() == md5 @pytest.mark.parametrize( From 6a9e3d10dd7697eb8e4a40312404a97755326d5e Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Wed, 19 Jun 2024 17:14:50 +0200 Subject: [PATCH 67/67] CI: remove pyarrow from branch to start test --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f895bbf6..9473ffc4 100644 --- 
a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches: [ main ]
   pull_request:
-    branches: [ main, pyarrow ]
+    branches: [ main ]
 
 jobs:
   build:
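Putting the pieces together (file stem arbitrary): re-saving an unchanged table is expected to reproduce the stored hash, even though the PARQUET bytes themselves may differ between writes::

    import audformat.testing
    from pyarrow import parquet

    db = audformat.testing.create_db()

    db["files"].save("./files", storage_format="parquet")
    hash_1 = parquet.read_schema("./files.parquet").metadata[b"hash"].decode()

    db["files"].save("./files", storage_format="parquet")
    hash_2 = parquet.read_schema("./files.parquet").metadata[b"hash"].decode()

    assert hash_1 == hash_2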